tesseract  4.0.0-1-g2a2b
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract ()
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * pix_original () const
 
void set_pix_original (Pix *original_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
void TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
virtual ~Wordrec ()=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
virtual ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
virtual ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_train_line_recognizer = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int multilang_debug_level = 0
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
bool lstm_use_matrix = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = false
 
bool hocr_font_info = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = false
 
bool tessedit_create_hocr = false
 
bool tessedit_create_tsv = false
 
bool tessedit_create_pdf = false
 
bool textonly_pdf = false
 
int jpg_quality = 85
 
int user_defined_dpi = 0
 
int min_characters_to_try = 50
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = FALSE
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
char * page_separator = "\f"
 
int lstm_choice_mode = 0
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 173 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 54 of file tesseractclass.cpp.

56  "Take segmentation and labeling from box file",
57  this->params()),
59  "Conversion of word/line box file to char box file",
60  this->params()),
62  "Generate training data from boxed chars", this->params()),
64  "Generate more boxes from boxed chars", this->params()),
66  "Break input into lines and remap boxes if present",
67  this->params()),
69  "Dump intermediate images made during page segmentation",
70  this->params()),
71  // The default for pageseg_mode is the old behaviour, so as not to
72  // upset anything that relies on that.
73  INT_MEMBER(
75  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76  " 5=line, 6=word, 7=char"
77  " (Values from PageSegMode enum in publictypes.h)",
78  this->params()),
80  "Which OCR engine(s) to run (Tesseract, LSTM, both)."
81  " Defaults to loading and running the most accurate"
82  " available.",
83  this->params()),
85  "Blacklist of chars not to recognize", this->params()),
87  "Whitelist of chars to recognize", this->params()),
89  "List of chars to override tessedit_char_blacklist",
90  this->params()),
92  "Perform training for ambiguities", this->params()),
95  "Whether to use the top-line splitting process for Devanagari "
96  "documents while performing page-segmentation.",
97  this->params()),
100  "Whether to use the top-line splitting process for Devanagari "
101  "documents while performing ocr.",
102  this->params()),
104  "Write all parameters to the given file.", this->params()),
106  "Generate and print debug"
107  " information for adaption",
108  this->params()),
109  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
110  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
111  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
112  this->params()),
114  "Exposure value follows"
115  " this pattern in the image filename. The name of the image"
116  " files are expected to be in the form"
117  " [lang].[fontname].exp[num].tif",
118  this->params()),
120  "Learn both character fragments (as is done in the"
121  " special low exposure mode) as well as unfragmented"
122  " characters.",
123  this->params()),
125  "Each bounding box"
126  " is assumed to contain ngrams. Only learn the ngrams"
127  " whose outlines overlap horizontally.",
128  this->params()),
129  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
130  this->params()),
131  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
132  this->params()),
133  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
134  this->params()),
136  "Try to improve fuzzy spaces", this->params()),
138  "Don't bother with word plausibility", this->params()),
139  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
140  this->params()),
141  BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
142  this->params()),
144  "Add words to the document dictionary", this->params()),
145  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
146  this->params()),
147  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
148  this->params()),
150  "Enable correction based on the word bigram dictionary.",
151  this->params()),
153  "Enable single word correction based on the dictionary.",
154  this->params()),
156  "Amount of debug output for bigram correction.",
157  this->params()),
159  "Remove and conditionally reassign small outlines when they"
160  " confuse layout analysis, determining diacritics vs noise",
161  this->params()),
162  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
163  this->params()),
164  // Worst (min) certainty, for which a diacritic is allowed to make the
165  // base
166  // character worse and still be included.
168  "Hingepoint for base char certainty", this->params()),
169  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
170  // to make the base character worse and still be included.
172  "Hingepoint for disjoint certainty", this->params()),
173  // Worst (min) certainty, for which a diacritic is allowed to make a new
174  // stand-alone blob.
176  "Threshold for new punc char certainty", this->params()),
177  // Factor of certainty margin for adding diacritics to not count as worse.
179  "Scaling on certainty diff from Hingepoint",
180  this->params()),
181  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
182  this->params()),
183  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
184  this->params()),
185  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
186  BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
187  this->params()),
188  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
189  this->params()),
190  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
191  this->params()),
192  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
193  this->params()),
195  "good_quality_doc lte rejection limit", this->params()),
197  "good_quality_doc gte good blobs limit", this->params()),
199  "good_quality_doc lte outline error limit", this->params()),
201  "good_quality_doc gte good char limit", this->params()),
202  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
203  this->params()),
205  "Adaptation decision algorithm for tess", this->params()),
207  "Do minimal rejection on pass 1 output", this->params()),
208  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
209  this->params()),
210  BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
211  this->params()),
213  "Adaptation decision algorithm for tess", this->params()),
214  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
215  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
216  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
217  INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
218  this->params()),
219  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
220  this->params()),
222  "Run paragraph detection on the post-text-recognition "
223  "(more accurate)",
224  this->params()),
226  "Use ratings matrix/beam search with lstm", this->params()),
227  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
228  this->params()),
229  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
230  this->params()),
232  "Allow outline errs in unrejection?", this->params()),
234  "Reduce rejection on good docs", this->params()),
235  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
236  this->params()),
238  "%rej allowed before rej whole doc", this->params()),
240  "%rej allowed before rej whole block", this->params()),
242  "%rej allowed before rej whole row", this->params()),
244  "Number of row rejects in whole word rejects"
245  " which prevents whole row rejection",
246  this->params()),
248  "Only rej partially rejected words in block rejection",
249  this->params()),
251  "Only rej partially rejected words in row rejection",
252  this->params()),
254  "Use word segmentation quality metric", this->params()),
256  "Use word segmentation quality metric", this->params()),
258  "Only preserve wds longer than this", this->params()),
260  "Apply row rejection to good docs", this->params()),
262  "rej good doc wd if more than this fraction rejected",
263  this->params()),
265  "Reject all bad quality wds", this->params()),
266  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
267  this->params()),
269  "Output data to debug file", this->params()),
270  BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
271  this->params()),
273  "good_quality_doc gte good char limit", this->params()),
275  "Mark v.bad words for tilde crunch", this->params()),
276  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
277  this->params()),
278  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
279  this->params()),
281  "Take out ~^ early?", this->params()),
282  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
283  this->params()),
284  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
286  "crunch garbage cert lt this", this->params()),
288  "crunch garbage rating lt this", this->params()),
289  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
290  this->params()),
291  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
292  this->params()),
293  BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
294  this->params()),
295  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
296  this->params()),
297  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
298  this->params()),
299  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
300  this->params()),
301  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
302  this->params()),
304  "Del if word width lt xht x this", this->params()),
306  "Del if word gt xht x this above bl", this->params()),
308  "Del if word gt xht x this below bl", this->params()),
309  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
310  this->params()),
311  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
312  this->params()),
314  "How many potential indicators needed", this->params()),
315  BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
316  this->params()),
317  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
318  this->params()),
320  "Don't pot crunch sensible strings", this->params()),
321  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
322  this->params()),
324  "Don't crunch words with long lower case strings",
325  this->params()),
327  "Don't crunch words with long lower case strings",
328  this->params()),
330  "Crunch words with long repetitions", this->params()),
331  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
333  "How many non-noise blbs either side?", this->params()),
334  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
335  this->params()),
337  "Reward punctuation joins", this->params()),
338  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
339  this->params()),
340  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
341  this->params()),
343  "Punct. chs expected WITHIN numbers", this->params()),
345  "Max allowed deviation of blob top outside of font data",
346  this->params()),
348  "Min change in xht before actually trying it", this->params()),
350  "Debug level for sub & superscript fixer", this->params()),
353  "How many times worse "
354  "certainty does a superscript position glyph need to be for "
355  "us to try classifying it as a char with a different "
356  "baseline?",
357  this->params()),
360  "What reduction in "
361  "badness do we think sufficient to choose a superscript "
362  "over what we'd thought. For example, a value of 0.6 means "
363  "we want to reduce badness of certainty by at least 40%",
364  this->params()),
366  "A superscript scaled down more than this is unbelievably "
367  "small. For example, 0.3 means we expect the font size to "
368  "be no smaller than 30% of the text line font size.",
369  this->params()),
371  "Maximum top of a character measured as a multiple of "
372  "x-height above the baseline for us to reconsider whether "
373  "it's a subscript.",
374  this->params()),
376  "Minimum bottom of a character measured as a multiple of "
377  "x-height above the baseline for us to reconsider whether "
378  "it's a superscript.",
379  this->params()),
381  "Write block separators in output", this->params()),
382  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
383  this->params()),
384  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
385  this->params()),
386  BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
387  this->params()),
388  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
389  this->params()),
390  BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
391  this->params()),
392  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
393  this->params()),
394  BOOL_MEMBER(textonly_pdf, false,
395  "Create PDF with only one invisible text layer",
396  this->params()),
397  INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
398  INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
399  this->params()),
401  "Specify minimum characters to try during OSD",
402  this->params()),
404  "Output char for unidentified blobs", this->params()),
405  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
407  "Min suspect level for rejecting spaces", this->params()),
409  "Don't suspect dict wds longer than this", this->params()),
410  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
411  this->params()),
413  "Don't touch bad rating limit", this->params()),
414  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
415  this->params()),
417  "Only reject tess failures", this->params()),
418  BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
419  this->params()),
421  "Make output have exactly one word per WERD", this->params()),
423  "Don't reject ANYTHING AT ALL", this->params()),
425  "Force all rep chars the same", this->params()),
426  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
427  this->params()),
428  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
429  this->params()),
430  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
431  this->params()),
433  "Aspect ratio dot/hyphen test", this->params()),
435  "Aspect ratio dot/hyphen test", this->params()),
437  "Use DOC dawg in 11l conf. detector", this->params()),
438  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
439  this->params()),
440  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
441  this->params()),
442  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
443  this->params()),
444  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
445  this->params()),
446  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
447  this->params()),
448  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
449  this->params()),
450  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
451  this->params()),
453  "if >this fract", this->params()),
454  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
455  this->params()),
457  "Allow NN to unrej", this->params()),
458  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
459  this->params()),
460  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
461  this->params()),
462  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
463  this->params()),
465  "-1 -> All pages"
466  " , else specific page to process",
467  this->params()),
469  "Capture the image from the IPE", this->params()),
470  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
471  this->params()),
472  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
473  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
474  this->params()),
476  "List of languages to load with this one", this->params()),
478  "In multilingual mode use params model of the"
479  " primary language",
480  this->params()),
482  "Min acceptable orientation margin", this->params()),
483  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
484  this->params()),
485  BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
486  this->params()),
488  "Allow feature extractors to see the original outline",
489  this->params()),
491  "Only initialize with the config file. Useful if the "
492  "instance is not going to be used for OCR but say only "
493  "for layout analysis.",
494  this->params()),
495  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
496  this->params()),
498  "Enable vertical detection", this->params()),
500  "Force using vertical text page mode", this->params()),
503  "Fraction of textlines deemed vertical to use vertical page "
504  "mode",
505  this->params()),
508  "Fraction of height used as a minimum gap for aligned blobs.",
509  this->params()),
510  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
511  this->params()),
513  "Preserve multiple interword spaces", this->params()),
515  "Page separator (default is form feed control character)",
516  this->params()),
518  "Allows to include alternative symbols choices in the hOCR output. "
519  "Valid input values are 0, 1 and 2. 0 is the default value. "
520  "With 1 the alternative symbol choices per timestep are included. "
521  "With 2 the alternative symbol choices are accumulated per character.",
522  this->params()),
523 
524  backup_config_file_(nullptr),
525  pix_binary_(nullptr),
526  pix_grey_(nullptr),
527  pix_original_(nullptr),
528  pix_thresholds_(nullptr),
529  source_resolution_(0),
530  textord_(this),
531  right_to_left_(false),
532  scaled_color_(nullptr),
533  scaled_factor_(-1),
534  deskew_(1.0f, 0.0f),
535  reskew_(1.0f, 0.0f),
536  most_recently_used_(this),
537  font_table_size_(0),
538  equ_detect_(nullptr),
539 #ifndef ANDROID_BUILD
540  lstm_recognizer_(nullptr),
541 #endif
542  train_line_page_num_(0) {
543 }
double superscript_bettered_certainty
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:294
char * tessedit_write_params_to_file
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:303
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
bool tessedit_enable_bigram_correction
double tessedit_reject_doc_percent
double tessedit_reject_block_percent
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:300
bool crunch_early_convert_bad_unlv_chs
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
double tessedit_reject_row_percent
bool tessedit_resegment_from_line_boxes
bool tessedit_preserve_blk_rej_perfect_wds
bool applybox_learn_chars_and_char_frags_mode
double tessedit_good_doc_still_rowrej_wd
bool textord_tabfind_force_vertical_text
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
double textord_tabfind_vertical_text_ratio
double tessedit_whole_wd_rej_row_percent
#define FALSE
Definition: capi.h:52
bool tessedit_preserve_row_rej_perfect_wds
char * ok_repeated_ch_non_alphanum_wds
ParamsVectors * params()
Definition: ccutil.h:62
double rej_whole_of_mostly_reject_word_fract
double textord_tabfind_aligned_gap_fraction

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )

Definition at line 545 of file tesseractclass.cpp.

545  {
546  Clear();
547  pixDestroy(&pix_original_);
548  end_tesseract();
549  sub_langs_.delete_data_pointers();
550 #ifndef ANDROID_BUILD
551  delete lstm_recognizer_;
552  lstm_recognizer_ = nullptr;
553 #endif
554 }

Member Function Documentation

◆ acceptable_number_string()

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 394 of file output.cpp.

395  {
396  bool prev_digit = false;
397 
398  if (*lengths == 1 && *s == '(')
399  s++;
400 
401  if (*lengths == 1 &&
402  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
403  s++;
404 
405  for (; *s != '\0'; s += *(lengths++)) {
406  if (unicharset.get_isdigit(s, *lengths))
407  prev_digit = true;
408  else if (prev_digit &&
409  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
410  prev_digit = false;
411  else if (prev_digit && *lengths == 1 &&
412  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
413  return true;
414  else if (prev_digit &&
415  *lengths == 1 && (*s == '%') &&
416  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
417  (*(s + *lengths + *(lengths + 1)) == '\0'))
418  return true;
419  else
420  return false;
421  }
422  return true;
423 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1764 of file control.cpp.

1765  {
1766  int i = 0;
1767  int offset = 0;
1768  int leading_punct_count;
1769  int upper_count = 0;
1770  int hyphen_pos = -1;
1771  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1772 
1773  if (strlen (lengths) > 20)
1774  return word_type;
1775 
1776  /* Single Leading punctuation char*/
1777 
1778  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1779  offset += lengths[i++];
1780  leading_punct_count = i;
1781 
1782  /* Initial cap */
1783  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1784  offset += lengths[i++];
1785  upper_count++;
1786  }
1787  if (upper_count > 1) {
1788  word_type = AC_UPPER_CASE;
1789  } else {
1790  /* Lower case word, possibly with an initial cap */
1791  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1792  offset += lengths[i++];
1793  }
1794  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1795  goto not_a_word;
1796  /*
1797  Allow a single hyphen in a lower case word
1798  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1799  */
1800  if (lengths[i] == 1 && s[offset] == '-') {
1801  hyphen_pos = i;
1802  offset += lengths[i++];
1803  if (s[offset] != '\0') {
1804  while ((s[offset] != '\0') &&
1805  char_set.get_islower(s + offset, lengths[i])) {
1806  offset += lengths[i++];
1807  }
1808  if (i < hyphen_pos + 3)
1809  goto not_a_word;
1810  }
1811  } else {
1812  /* Allow "'s" in NON hyphenated lower case words */
1813  if (lengths[i] == 1 && (s[offset] == '\'') &&
1814  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1815  offset += lengths[i++];
1816  offset += lengths[i++];
1817  }
1818  }
1819  if (upper_count > 0)
1820  word_type = AC_INITIAL_CAP;
1821  else
1822  word_type = AC_LOWER_CASE;
1823  }
1824 
1825  /* Up to two different, constrained trailing punctuation chars */
1826  if (lengths[i] == 1 && s[offset] != '\0' &&
1827  STRING(chs_trailing_punct1).contains(s[offset]))
1828  offset += lengths[i++];
1829  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1830  s[offset - lengths[i - 1]] != s[offset] &&
1831  STRING(chs_trailing_punct2).contains (s[offset]))
1832  offset += lengths[i++];
1833 
1834  if (s[offset] != '\0')
1835  word_type = AC_UNACCEPTABLE;
1836 
1837  not_a_word:
1838 
1839  if (word_type == AC_UNACCEPTABLE) {
1840  /* Look for abbreviation string */
1841  i = 0;
1842  offset = 0;
1843  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1844  word_type = AC_UC_ABBREV;
1845  while (s[offset] != '\0' &&
1846  char_set.get_isupper(s + offset, lengths[i]) &&
1847  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1848  offset += lengths[i++];
1849  offset += lengths[i++];
1850  }
1851  }
1852  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1853  word_type = AC_LC_ABBREV;
1854  while (s[offset] != '\0' &&
1855  char_set.get_islower(s + offset, lengths[i]) &&
1856  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1857  offset += lengths[i++];
1858  offset += lengths[i++];
1859  }
1860  }
1861  if (s[offset] != '\0')
1862  word_type = AC_UNACCEPTABLE;
1863  }
1864 
1865  return word_type;
1866 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
a.b.c.
Definition: control.h:34
A.B.C.
Definition: control.h:35
ALL but initial lc.
Definition: control.h:33
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
ALL upper case.
Definition: control.h:32
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
Unacceptable word.
Definition: control.h:30
ALL lower case.
Definition: control.h:31

◆ alpha_count()

int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 500 of file reject.cpp.

501  {
502  int16_t i;
503  int16_t offset;
504  int16_t count = 0;
505 
506  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
507  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
508  count++;
509  }
510  return count;
511 }
int count(LIST var_list)
Definition: oldlist.cpp:98
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
UNICHARSET unicharset
Definition: ccutil.h:68

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 209 of file recogtraining.cpp.

211  {
212  // Classify word.
213  fflush(stdout);
214  WordData word_data(*pr_it);
215  SetupWordPassN(1, &word_data);
216  classify_word_and_language(1, pr_it, &word_data);
217  WERD_RES* werd_res = word_data.word;
218  WERD_CHOICE *best_choice = werd_res->best_choice;
219  ASSERT_HOST(best_choice != nullptr);
220 
221  // Compute the number of unichars in the label.
222  GenericVector<UNICHAR_ID> encoding;
223  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
224  tprintf("Not outputting illegal unichar %s\n", label);
225  return;
226  }
227 
228  // Dump all paths through the ratings matrix (which is normally small).
229  int dim = werd_res->ratings->dimension();
230  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
231  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
232  unicharset, label, output_file);
233  delete [] blob_choices;
234 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
UNICHARSET unicharset
Definition: ccutil.h:68
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
MATRIX * ratings
Definition: pageres.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:235
#define ASSERT_HOST(x)
Definition: errcode.h:84
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 288 of file tesseractclass.h.

288  {
289  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
290  for (int i = 0; i < sub_langs_.size(); ++i) {
291  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
292  return true;
293  }
294  return false;
295  }

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 280 of file tesseractclass.h.

280  {
281  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
282  for (int i = 0; i < sub_langs_.size(); ++i) {
283  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
284  }
285  return false;
286  }

◆ ApplyBoxes()

PAGE_RES* tesseract::Tesseract::ApplyBoxes ( const STRING fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const STRING fontname,
PAGE_RES page_res 
)

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE *> &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB *> *  target_blobs 
)

Definition at line 1074 of file control.cpp.

1077  {
1078 #ifndef DISABLED_LEGACY_ENGINE
1079  GenericVector<bool> blob_wanted;
1080  word_wanted->init_to_size(outlines.size(), false);
1081  target_blobs->init_to_size(outlines.size(), nullptr);
1082  // Check for outlines that need to be turned into stand-alone blobs.
1083  for (int i = 0; i < outlines.size(); ++i) {
1084  if (outlines[i] == nullptr) continue;
1085  // Get a set of adjacent outlines that don't overlap any existing blob.
1086  blob_wanted.init_to_size(outlines.size(), false);
1087  int num_blob_outlines = 0;
1088  TBOX total_ol_box(outlines[i]->bounding_box());
1089  while (i < outlines.size() && outlines[i] != nullptr) {
1090  blob_wanted[i] = true;
1091  total_ol_box += outlines[i]->bounding_box();
1092  ++i;
1093  ++num_blob_outlines;
1094  }
1095  // Find the insertion point.
1096  C_BLOB_IT blob_it(real_word->cblob_list());
1097  while (!blob_it.at_last() &&
1098  blob_it.data_relative(1)->bounding_box().left() <=
1099  total_ol_box.left()) {
1100  blob_it.forward();
1101  }
1102  // Choose which combination of them we actually want and where to put
1103  // them.
1104  if (debug_noise_removal)
1105  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1106  C_BLOB* left_blob = blob_it.data();
1107  TBOX left_box = left_blob->bounding_box();
1108  C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1109  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1110  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1111  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1112  outlines, num_blob_outlines,
1113  &blob_wanted)) {
1114  if (debug_noise_removal) tprintf("Added to left blob\n");
1115  for (int j = 0; j < blob_wanted.size(); ++j) {
1116  if (blob_wanted[j]) {
1117  (*word_wanted)[j] = true;
1118  (*target_blobs)[j] = left_blob;
1119  }
1120  }
1121  } else if (right_blob != nullptr &&
1122  (!left_box.x_overlap(total_ol_box) ||
1123  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1125  right_blob, outlines,
1126  num_blob_outlines, &blob_wanted)) {
1127  if (debug_noise_removal) tprintf("Added to right blob\n");
1128  for (int j = 0; j < blob_wanted.size(); ++j) {
1129  if (blob_wanted[j]) {
1130  (*word_wanted)[j] = true;
1131  (*target_blobs)[j] = right_blob;
1132  }
1133  }
1134  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1135  outlines, num_blob_outlines,
1136  &blob_wanted)) {
1137  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1138  for (int j = 0; j < blob_wanted.size(); ++j) {
1139  if (blob_wanted[j]) {
1140  (*word_wanted)[j] = true;
1141  (*target_blobs)[j] = nullptr;
1142  }
1143  }
1144  }
1145  }
1146 #endif // ndef DISABLED_LEGACY_ENGINE
1147 }
int size() const
Definition: genericvector.h:71
bool x_overlap(const TBOX &box) const
Definition: rect.h:401
Definition: rect.h:34
void init_to_size(int size, const T &t)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1152
TBOX bounding_box() const
Definition: stepblob.cpp:255

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE *> &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB *> *  target_blobs 
)

Definition at line 1019 of file control.cpp.

1023  {
1024 #ifndef DISABLED_LEGACY_ENGINE
1025  GenericVector<bool> blob_wanted;
1026  word_wanted->init_to_size(outlines.size(), false);
1027  overlapped_any_blob->init_to_size(outlines.size(), false);
1028  target_blobs->init_to_size(outlines.size(), nullptr);
1029  // For each real blob, find the outlines that seriously overlap it.
1030  // A single blob could be several merged characters, so there can be quite
1031  // a few outlines overlapping, and the full engine needs to be used to chop
1032  // and join to get a sensible result.
1033  C_BLOB_IT blob_it(real_word->cblob_list());
1034  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1035  C_BLOB* blob = blob_it.data();
1036  const TBOX blob_box = blob->bounding_box();
1037  blob_wanted.init_to_size(outlines.size(), false);
1038  int num_blob_outlines = 0;
1039  for (int i = 0; i < outlines.size(); ++i) {
1040  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1041  !(*word_wanted)[i]) {
1042  blob_wanted[i] = true;
1043  (*overlapped_any_blob)[i] = true;
1044  ++num_blob_outlines;
1045  }
1046  }
1047  if (debug_noise_removal) {
1048  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1049  blob_box.print();
1050  }
1051  // If any outlines overlap the blob, and not too many, classify the blob
1052  // (using the full engine, languages and all), and choose the maximal
1053  // combination of outlines that doesn't hurt the end-result classification
1054  // by too much. Mark them as wanted.
1055  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1056  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1057  outlines, num_blob_outlines,
1058  &blob_wanted)) {
1059  for (int i = 0; i < blob_wanted.size(); ++i) {
1060  if (blob_wanted[i]) {
1061  // Claim the outline and record where it is going.
1062  (*word_wanted)[i] = true;
1063  (*target_blobs)[i] = blob;
1064  }
1065  }
1066  }
1067  }
1068  }
1069 #endif // ndef DISABLED_LEGACY_ENGINE
1070 }
int size() const
Definition: genericvector.h:71
void print() const
Definition: rect.h:278
Definition: rect.h:34
void init_to_size(int size, const T &t)
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1152
TBOX bounding_box() const
Definition: stepblob.cpp:255

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 201 of file pagesegmain.cpp.

204  {
205  Pix* photomask_pix = nullptr;
206  Pix* musicmask_pix = nullptr;
207  // The blocks made by the ColumnFinder. Moved to blocks before return.
208  BLOCK_LIST found_blocks;
209  TO_BLOCK_LIST temp_blocks;
210 
211  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
212  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
213  &musicmask_pix);
214  int result = 0;
215  if (finder != nullptr) {
216  TO_BLOCK_IT to_block_it(&temp_blocks);
217  TO_BLOCK* to_block = to_block_it.data();
218  if (musicmask_pix != nullptr) {
219  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
220  // blocks separately. For now combine with photomask_pix.
221  pixOr(photomask_pix, photomask_pix, musicmask_pix);
222  }
223  if (equ_detect_) {
224  finder->SetEquationDetect(equ_detect_);
225  }
226  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
227  to_block, photomask_pix, pix_thresholds_,
228  pix_grey_, &pixa_debug_, &found_blocks,
229  diacritic_blobs, to_blocks);
230  if (result >= 0)
231  finder->GetDeskewVectors(&deskew_, &reskew_);
232  delete finder;
233  }
234  pixDestroy(&photomask_pix);
235  pixDestroy(&musicmask_pix);
236  if (result < 0) return result;
237 
238  blocks->clear();
239  BLOCK_IT block_it(blocks);
240  // Move the found blocks to the input/output blocks.
241  block_it.add_list_after(&found_blocks);
242  return result;
243 }
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 522 of file superscript.cpp.

526  {
527  int initial_ok_run_count = 0;
528  int ok_run_count = 0;
529  float worst_certainty = 0.0f;
530  const WERD_CHOICE &wc = *word.best_choice;
531 
532  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
533  for (int i = 0; i < wc.length(); i++) {
534  TBLOB *blob = word.rebuild_word->blobs[i];
535  UNICHAR_ID unichar_id = wc.unichar_id(i);
536  float char_certainty = wc.certainty(i);
537  bool bad_certainty = char_certainty < certainty_threshold;
538  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
539  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
540  BLOB_CHOICE *choice = word.GetBlobChoice(i);
541  if (choice && fontinfo_table.size() > 0) {
542  // Get better information from the specific choice, if available.
543  int font_id1 = choice->fontinfo_id();
544  bool font1_is_italic = font_id1 >= 0
545  ? fontinfo_table.get(font_id1).is_italic() : false;
546  int font_id2 = choice->fontinfo_id2();
547  is_italic = font1_is_italic &&
548  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
549  }
550 
551  float height_fraction = 1.0f;
552  float char_height = blob->bounding_box().height();
553  float normal_height = char_height;
554  if (wc.unicharset()->top_bottom_useful()) {
555  int min_bot, max_bot, min_top, max_top;
556  wc.unicharset()->get_top_bottom(unichar_id,
557  &min_bot, &max_bot,
558  &min_top, &max_top);
559  float hi_height = max_top - max_bot;
560  float lo_height = min_top - min_bot;
561  normal_height = (hi_height + lo_height) / 2;
562  if (normal_height >= kBlnXHeight) {
563  // Only ding characters that we have decent information for because
564  // they're supposed to be normal sized, not tiny specks or dashes.
565  height_fraction = char_height / normal_height;
566  }
567  }
568  bool bad_height = height_fraction < superscript_scaledown_ratio;
569 
570  if (debug) {
571  if (is_italic) {
572  tprintf(" Rejecting: superscript is italic.\n");
573  }
574  if (is_punc) {
575  tprintf(" Rejecting: punctuation present.\n");
576  }
577  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
578  if (bad_certainty) {
579  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
580  "which is less than threshold %.2f\n", char_str,
581  char_certainty, certainty_threshold);
582  }
583  if (bad_height) {
584  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
585  "expected %.2f\n", char_str, char_height, normal_height);
586  }
587  }
588  if (bad_certainty || bad_height || is_punc || is_italic) {
589  if (ok_run_count == i) {
590  initial_ok_run_count = ok_run_count;
591  }
592  ok_run_count = 0;
593  } else {
594  ok_run_count++;
595  }
596  if (char_certainty < worst_certainty) {
597  worst_certainty = char_certainty;
598  }
599  }
600  bool all_ok = ok_run_count == wc.length();
601  if (all_ok && debug) {
602  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
603  }
604  if (!all_ok) {
605  if (left_ok) *left_ok = initial_ok_run_count;
606  if (right_ok) *right_ok = ok_run_count;
607  }
608  return all_ok;
609 }
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
const int kBlnXHeight
Definition: normalis.h:24
const FontInfo * fontinfo
Definition: pageres.h:304
float certainty() const
Definition: ratngs.h:330
int16_t fontinfo_id() const
Definition: ratngs.h:86
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
int16_t fontinfo_id2() const
Definition: ratngs.h:89
bool is_italic() const
Definition: fontinfo.h:111
int size() const
Return the size used.
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
int length() const
Definition: ratngs.h:303
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
const T & get(int id) const
Return the object from an id.
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
Definition: blobs.h:268
bool top_bottom_useful() const
Definition: unicharset.h:532
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:756

◆ BestPix()

Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 229 of file tesseractclass.h.

229  {
230  if (pixGetWidth(pix_original_) == ImageWidth())
231  return pix_original_;
232  else if (pix_grey_ != nullptr)
233  return pix_grey_;
234  else
235  return pix_binary_;
236  }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES page_res)

Definition at line 473 of file control.cpp.

473  {
474  PAGE_RES_IT word_it(page_res);
475 
476  WERD_RES *w_prev = nullptr;
477  WERD_RES *w = word_it.word();
478  while (true) {
479  w_prev = w;
480  while (word_it.forward() != nullptr &&
481  (!word_it.word() || word_it.word()->part_of_combo)) {
482  // advance word_it, skipping over parts of combos
483  }
484  if (!word_it.word()) break;
485  w = word_it.word();
486  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
487  continue;
488  }
489  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
490  if (tessedit_bigram_debug) {
491  tprintf("Skipping because one of the words is W_REP_CHAR\n");
492  }
493  continue;
494  }
495  // Two words sharing the same language model, excellent!
496  GenericVector<WERD_CHOICE *> overrides_word1;
497  GenericVector<WERD_CHOICE *> overrides_word2;
498 
499  const STRING orig_w1_str = w_prev->best_choice->unichar_string();
500  const STRING orig_w2_str = w->best_choice->unichar_string();
501  WERD_CHOICE prev_best(w->uch_set);
502  {
503  int w1start, w1end;
504  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
505  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
506  }
507  WERD_CHOICE this_best(w->uch_set);
508  {
509  int w2start, w2end;
510  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
511  this_best = w->best_choice->shallow_copy(w2start, w2end);
512  }
513 
514  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
515  if (tessedit_bigram_debug) {
516  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
517  orig_w1_str.string(), orig_w2_str.string());
518  }
519  continue;
520  }
521  if (tessedit_bigram_debug > 2) {
522  tprintf("Examining alt choices for \"%s %s\".\n",
523  orig_w1_str.string(), orig_w2_str.string());
524  }
525  if (tessedit_bigram_debug > 1) {
526  if (!w_prev->best_choices.singleton()) {
527  w_prev->PrintBestChoices();
528  }
529  if (!w->best_choices.singleton()) {
530  w->PrintBestChoices();
531  }
532  }
533  float best_rating = 0.0;
534  int best_idx = 0;
535  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
536  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
537  WERD_CHOICE *p1 = prev_it.data();
538  WERD_CHOICE strip1(w->uch_set);
539  {
540  int p1start, p1end;
541  p1->GetNonSuperscriptSpan(&p1start, &p1end);
542  strip1 = p1->shallow_copy(p1start, p1end);
543  }
544  WERD_CHOICE_IT w_it(&w->best_choices);
545  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
546  WERD_CHOICE *p2 = w_it.data();
547  WERD_CHOICE strip2(w->uch_set);
548  {
549  int p2start, p2end;
550  p2->GetNonSuperscriptSpan(&p2start, &p2end);
551  strip2 = p2->shallow_copy(p2start, p2end);
552  }
553  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
554  overrides_word1.push_back(p1);
555  overrides_word2.push_back(p2);
556  if (overrides_word1.size() == 1 ||
557  p1->rating() + p2->rating() < best_rating) {
558  best_rating = p1->rating() + p2->rating();
559  best_idx = overrides_word1.size() - 1;
560  }
561  }
562  }
563  }
564  if (!overrides_word1.empty()) {
565  // Excellent, we have some bigram matches.
567  *overrides_word1[best_idx]) &&
569  *overrides_word2[best_idx])) {
570  if (tessedit_bigram_debug > 1) {
571  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
572  "model.\n", orig_w1_str.string(), orig_w2_str.string());
573  }
574  continue;
575  }
576  const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
577  const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
578  if (new_w1_str != orig_w1_str) {
579  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
580  }
581  if (new_w2_str != orig_w2_str) {
582  w->ReplaceBestChoice(overrides_word2[best_idx]);
583  }
584  if (tessedit_bigram_debug > 0) {
585  STRING choices_description;
586  int num_bigram_choices
587  = overrides_word1.size() * overrides_word2.size();
588  if (num_bigram_choices == 1) {
589  choices_description = "This was the unique bigram choice.";
590  } else {
591  if (tessedit_bigram_debug > 1) {
592  STRING bigrams_list;
593  const int kMaxChoicesToPrint = 20;
594  for (int i = 0; i < overrides_word1.size() &&
595  i < kMaxChoicesToPrint; i++) {
596  if (i > 0) { bigrams_list += ", "; }
597  WERD_CHOICE *p1 = overrides_word1[i];
598  WERD_CHOICE *p2 = overrides_word2[i];
599  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
600  }
601  choices_description = "There were many choices: {";
602  choices_description += bigrams_list;
603  choices_description += "}";
604  } else {
605  choices_description.add_str_int("There were ", num_bigram_choices);
606  choices_description += " compatible bigrams.";
607  }
608  }
609  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
610  orig_w1_str.string(), orig_w2_str.string(),
611  new_w1_str.string(), new_w2_str.string(),
612  choices_description.string());
613  }
614  }
615  }
616 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
int size() const
Definition: genericvector.h:71
Dict & getDict() override
const char * string() const
Definition: strngs.cpp:196
float rating() const
Definition: ratngs.h:327
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:787
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:414
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:397
void PrintBestChoices() const
Definition: pageres.cpp:723
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:808
int push_back(T object)
void add_str_int(const char *str, int number)
Definition: strngs.cpp:379
tesseract::Tesseract * tesseract
Definition: pageres.h:282
Definition: strngs.h:45
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:801
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES *  page_res)

Definition at line 716 of file control.cpp.

716  {
717  if (!wordrec_run_blamer) return;
718  PAGE_RES_IT page_res_it(page_res);
719  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
720  page_res_it.forward()) {
721  WERD_RES *word = page_res_it.word();
724  }
725  tprintf("Blame reasons:\n");
726  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
728  static_cast<IncorrectResultReason>(bl)),
729  page_res->blame_reasons[bl]);
730  }
731  if (page_res->misadaption_log.length() > 0) {
732  tprintf("Misadaption log:\n");
733  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
734  tprintf("%s\n", page_res->misadaption_log[i].string());
735  }
736  }
737 }
GenericVector< STRING > misadaption_log
Definition: pageres.h:92
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
int length() const
Definition: genericvector.h:85
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
GenericVector< int > blame_reasons
Definition: pageres.h:87
BlamerBundle * blamer_bundle
Definition: pageres.h:246
bool wordrec_run_blamer
Definition: wordrec.h:237
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:552
bool wordrec_debug_blamer
Definition: wordrec.h:236
WERD * word
Definition: pageres.h:189

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

Definition at line 959 of file pgedit.cpp.

960  {
961 #ifndef DISABLED_LEGACY_ENGINE
962  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
963  if (it != nullptr) {
964  WERD_RES* word_res = it->word();
965  word_res->x_height = it->row()->row->x_height();
966  word_res->SetupForRecognition(unicharset, this, BestPix(),
967  tessedit_ocr_engine_mode, nullptr,
971  it->row()->row, it->block()->block);
972  TWERD* bln_word = word_res->chopped_word;
973  TBLOB* bln_blob = bln_word->blobs[0];
974  INT_FX_RESULT_STRUCT fx_info;
977  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
978  &cn_features, &fx_info, nullptr);
979  // Display baseline features.
980  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
982  for (int f = 0; f < bl_features.size(); ++f)
983  RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
984  bl_win->Update();
985  // Display cn features.
986  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
988  for (int f = 0; f < cn_features.size(); ++f)
989  RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
990  cn_win->Update();
991 
992  it->DeleteCurrentWord();
993  delete it;
994  }
995 #endif // ndef DISABLED_LEGACY_ENGINE
996 }
BLOCK_RES * block() const
Definition: pageres.h:757
int size() const
Definition: genericvector.h:71
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:998
Definition: blobs.h:402
ROW_RES * row() const
Definition: pageres.h:754
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:444
bool classify_bln_numeric_mode
Definition: classify.h:541
static void Update()
Definition: scrollview.cpp:711
Pix * BestPix() const
bool classify_nonlinear_norm
Definition: classify.h:457
BLOCK * block
Definition: pageres.h:117
float x_height() const
Definition: ocrrow.h:64
UNICHARSET unicharset
Definition: ccutil.h:68
WERD_RES * word() const
Definition: pageres.h:751
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1628
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
float x_height
Definition: pageres.h:311
void DeleteCurrentWord()
Definition: pageres.cpp:1450
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1789
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
Definition: blobs.h:268
TWERD * chopped_word
Definition: pageres.h:215
ROW * row
Definition: pageres.h:143

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 790 of file fixspace.cpp.

790  {
791  TBOX box; // BB of outline
792  int16_t outline_count = 0;
793  int16_t max_dimension;
794  int16_t largest_outline_dimension = 0;
795 
796  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
797  outline_count++;
798  box = ol->bounding_box();
799  if (box.height() > box.width()) {
800  max_dimension = box.height();
801  } else {
802  max_dimension = box.width();
803  }
804 
805  if (largest_outline_dimension < max_dimension)
806  largest_outline_dimension = max_dimension;
807  }
808 
809  if (outline_count > 5) {
810  // penalise LOTS of blobs
811  largest_outline_dimension *= 2;
812  }
813 
814  box = blob->bounding_box();
815  if (box.bottom() > kBlnBaselineOffset * 4 ||
816  box.top() < kBlnBaselineOffset / 2) {
817  // Lax blob is if high or low
818  largest_outline_dimension /= 2;
819  }
820 
821  return largest_outline_dimension;
822 }
TESSLINE * next
Definition: blobs.h:265
Definition: rect.h:34
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t width() const
Definition: rect.h:115
int16_t top() const
Definition: rect.h:58
TBOX bounding_box() const
Definition: blobs.cpp:478
int16_t bottom() const
Definition: rect.h:65
TESSLINE * outlines
Definition: blobs.h:384
int16_t height() const
Definition: rect.h:108

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word(): Find the word containing the blob that looks most like noise, then break that word into two, deleting the noise blob.

Definition at line 645 of file fixspace.cpp.

645  {
646  WERD_RES_IT word_it(&words);
647  WERD_RES_IT worst_word_it;
648  float worst_noise_score = 9999;
649  int worst_blob_index = -1; // Noisiest blob of noisiest wd
650  int blob_index; // of wds noisiest blob
651  float noise_score; // of wds noisiest blob
652  WERD_RES *word_res;
653  C_BLOB_IT blob_it;
654  C_BLOB_IT rej_cblob_it;
655  C_BLOB_LIST new_blob_list;
656  C_BLOB_IT new_blob_it;
657  C_BLOB_IT new_rej_cblob_it;
658  WERD *new_word;
659  int16_t start_of_noise_blob;
660  int16_t i;
661 
662  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
663  blob_index = worst_noise_blob(word_it.data(), &noise_score);
664  if (blob_index > -1 && worst_noise_score > noise_score) {
665  worst_noise_score = noise_score;
666  worst_blob_index = blob_index;
667  worst_word_it = word_it;
668  }
669  }
670  if (worst_blob_index < 0) {
671  words.clear(); // signal termination
672  return;
673  }
674 
675  /* Now split the worst_word_it */
676 
677  word_res = worst_word_it.data();
678 
679  /* Move blobs before noise blob to a new bloblist */
680 
681  new_blob_it.set_to_list(&new_blob_list);
682  blob_it.set_to_list(word_res->word->cblob_list());
683  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
684  new_blob_it.add_after_then_move(blob_it.extract());
685  }
686  start_of_noise_blob = blob_it.data()->bounding_box().left();
687  delete blob_it.extract(); // throw out noise blob
688 
689  new_word = new WERD(&new_blob_list, word_res->word);
690  new_word->set_flag(W_EOL, FALSE);
691  word_res->word->set_flag(W_BOL, FALSE);
692  word_res->word->set_blanks(1); // After break
693 
694  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
695  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
696  for (;
697  (!rej_cblob_it.empty() &&
698  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
699  rej_cblob_it.forward()) {
700  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
701  }
702 
703  WERD_RES* new_word_res = new WERD_RES(new_word);
704  new_word_res->combination = true;
705  worst_word_it.add_before_then_move(new_word_res);
706 
707  word_res->ClearResults();
708 }
Definition: werd.h:35
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
#define FALSE
Definition: capi.h:52
void set_blanks(uint8_t new_blanks)
Definition: werd.h:105
Definition: werd.h:59
Definition: werd.h:34
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
bool combination
Definition: pageres.h:334
void ClearResults()
Definition: pageres.cpp:1153
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710
WERD * word
Definition: pageres.h:189

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 247 of file pgedit.cpp.

247  {
248  SVMenuNode* parent_menu;
249  SVMenuNode* root_menu_item = new SVMenuNode();
250 
251  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
252 
253  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
254  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
255  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
256  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
257  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
258  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
259  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
260  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
261 
262  parent_menu = root_menu_item->AddChild("DISPLAY");
263 
264  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
265  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
266  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
267  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
268  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
269  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
270  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
271  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
272  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
273  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
274  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
275  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
276  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
277  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
278  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
279 
280 
281  parent_menu = root_menu_item->AddChild("OTHER");
282 
283  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
284  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
285  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
286  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
287  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
288  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
289 
290  return root_menu_item;
291 }
#define TRUE
Definition: capi.h:51
#define FALSE
Definition: capi.h:52
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:58

◆ check_debug_pt()

bool tesseract::Tesseract::check_debug_pt ( WERD_RES * word,
int  location 
)

Definition at line 1868 of file control.cpp.

1868  {
1869  bool show_map_detail = false;
1870  int16_t i;
1871 
1872  if (!test_pt)
1873  return false;
1874 
1875  tessedit_rejection_debug.set_value (FALSE);
1876  debug_x_ht_level.set_value(0);
1877 
1878  if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
1879  if (location < 0)
1880  return true; // For breakpoint use
1881  tessedit_rejection_debug.set_value(TRUE);
1882  debug_x_ht_level.set_value(2);
1883  tprintf ("\n\nTESTWD::");
1884  switch (location) {
1885  case 0:
1886  tprintf ("classify_word_pass1 start\n");
1887  word->word->print();
1888  break;
1889  case 10:
1890  tprintf ("make_reject_map: initial map");
1891  break;
1892  case 20:
1893  tprintf ("make_reject_map: after NN");
1894  break;
1895  case 30:
1896  tprintf ("classify_word_pass2 - START");
1897  break;
1898  case 40:
1899  tprintf ("classify_word_pass2 - Pre Xht");
1900  break;
1901  case 50:
1902  tprintf ("classify_word_pass2 - END");
1903  show_map_detail = true;
1904  break;
1905  case 60:
1906  tprintf ("fixspace");
1907  break;
1908  case 70:
1909  tprintf ("MM pass START");
1910  break;
1911  case 80:
1912  tprintf ("MM pass END");
1913  break;
1914  case 90:
1915  tprintf ("After Poor quality rejection");
1916  break;
1917  case 100:
1918  tprintf ("unrej_good_quality_words - START");
1919  break;
1920  case 110:
1921  tprintf ("unrej_good_quality_words - END");
1922  break;
1923  case 120:
1924  tprintf ("Write results pass");
1925  show_map_detail = true;
1926  break;
1927  }
1928  if (word->best_choice != nullptr) {
1929  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1930  word->reject_map.print(debug_fp);
1931  tprintf("\n");
1932  if (show_map_detail) {
1933  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1934  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1935  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1936  word->reject_map[i].full_print(debug_fp);
1937  }
1938  }
1939  } else {
1940  tprintf("null best choice\n");
1941  }
1942  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1943  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1944  return true;
1945  } else {
1946  return false;
1947  }
1948 }
#define TRUE
Definition: capi.h:51
FILE * debug_fp
Definition: tessvars.cpp:24
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
void full_print(FILE *fp)
Definition: rejctmap.cpp:335
TBOX bounding_box() const
Definition: werd.cpp:159
void print()
Definition: werd.cpp:265
#define FALSE
Definition: capi.h:52
bool tess_accepted
Definition: pageres.h:296
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool done
Definition: pageres.h:298
bool contains(const FCOORD pt) const
Definition: rect.h:333
Definition: points.h:189
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235
void print(FILE *fp)
Definition: rejctmap.cpp:323
WERD * word
Definition: pageres.h:189

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT * pr_it,
WordData * word_data 
)

Definition at line 1338 of file control.cpp.

1339  {
1340 #ifdef DISABLED_LEGACY_ENGINE
1342 #else
1343  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1345 #endif // def DISABLED_LEGACY_ENGINE
1346 
1347  // Best result so far.
1348  PointerVector<WERD_RES> best_words;
1349  // Points to the best result. May be word or in lang_words.
1350  const WERD_RES* word = word_data->word;
1351  clock_t start_t = clock();
1352  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1353  if (debug) {
1354  tprintf("%s word with lang %s at:",
1355  word->done ? "Already done" : "Processing",
1356  most_recently_used_->lang.string());
1357  word->word->bounding_box().print();
1358  }
1359  if (word->done) {
1360  // If done on pass1, leave it as-is.
1361  if (!word->tess_failed)
1362  most_recently_used_ = word->tesseract;
1363  return;
1364  }
1365  int sub = sub_langs_.size();
1366  if (most_recently_used_ != this) {
1367  // Get the index of the most_recently_used_.
1368  for (sub = 0; sub < sub_langs_.size() &&
1369  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1370  }
1371  most_recently_used_->RetryWithLanguage(
1372  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1373  Tesseract* best_lang_tess = most_recently_used_;
1374  if (!WordsAcceptable(best_words)) {
1375  // Try all the other languages to see if they are any better.
1376  if (most_recently_used_ != this &&
1377  this->RetryWithLanguage(*word_data, recognizer, debug,
1378  &word_data->lang_words[sub_langs_.size()],
1379  &best_words) > 0) {
1380  best_lang_tess = this;
1381  }
1382  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1383  ++i) {
1384  if (most_recently_used_ != sub_langs_[i] &&
1385  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1386  &word_data->lang_words[i],
1387  &best_words) > 0) {
1388  best_lang_tess = sub_langs_[i];
1389  }
1390  }
1391  }
1392  most_recently_used_ = best_lang_tess;
1393  if (!best_words.empty()) {
1394  if (best_words.size() == 1 && !best_words[0]->combination) {
1395  // Move the best single result to the main word.
1396  word_data->word->ConsumeWordResults(best_words[0]);
1397  } else {
1398  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1399  word_data->word = best_words.back();
1400  pr_it->ReplaceCurrentWord(&best_words);
1401  }
1402  ASSERT_HOST(word_data->word->box_word != nullptr);
1403  } else {
1404  tprintf("no best words!!\n");
1405  }
1406  clock_t ocr_t = clock();
1407  if (tessedit_timing_debug) {
1408  tprintf("%s (ocr took %.2f sec)\n",
1409  word->best_choice->unichar_string().string(),
1410  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1411  }
1412 }
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1322
bool tess_failed
Definition: pageres.h:288
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1591
void print() const
Definition: rect.h:278
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: werd.cpp:159
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1420
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:910
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
STRING lang
Definition: ccutil.h:66
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool done
Definition: pageres.h:298
tesseract::Tesseract * tesseract
Definition: pageres.h:282
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1420 of file control.cpp.

1422  {
1423  ROW* row = word_data.row;
1424  BLOCK* block = word_data.block;
1425  prev_word_best_choice_ = word_data.prev_word != nullptr
1426  ? word_data.prev_word->word->best_choice : nullptr;
1427 #ifndef ANDROID_BUILD
1428 #ifdef DISABLED_LEGACY_ENGINE
1430 #else
1433 #endif // def DISABLED_LEGACY_ENGINE
1434  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1435  LSTMRecognizeWord(*block, row, *in_word, out_words);
1436  if (!out_words->empty())
1437  return; // Successful lstm recognition.
1438  }
1440  // No fallback allowed, so use a fake.
1441  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1442  return;
1443  }
1444 
1445  #ifndef DISABLED_LEGACY_ENGINE
1446  // Fall back to tesseract for failed words or odd words.
1447  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1448  OEM_TESSERACT_ONLY, nullptr,
1451  poly_allow_detailed_fx, row, block);
1452 #endif // ndef DISABLED_LEGACY_ENGINE
1453  }
1454 #endif // ndef ANDROID_BUILD
1455 
1456 #ifndef DISABLED_LEGACY_ENGINE
1457  WERD_RES* word = *in_word;
1458  match_word_pass_n(1, word, row, block);
1459  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1460  word->tess_would_adapt = AdaptableWord(word);
1461  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1462 
1463  if (adapt_ok) {
1464  // Send word to adaptive classifier for training.
1465  word->BestChoiceToCorrectText();
1466  LearnWord(nullptr, word);
1467  // Mark misadaptions if running blamer.
1468  if (word->blamer_bundle != nullptr) {
1471  }
1472  }
1473 
1474  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1476  }
1477 #endif // ndef DISABLED_LEGACY_ENGINE
1478 }
const UNICHARSET & GetUnicharset() const
bool tess_failed
Definition: pageres.h:288
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:222
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
bool classify_bln_numeric_mode
Definition: classify.h:541
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:823
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:579
Pix * BestPix() const
bool IsAmbiguous()
Definition: pageres.cpp:458
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1649
bool tess_would_adapt
Definition: pageres.h:297
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:35
UNICHARSET unicharset
Definition: ccutil.h:68
void BestChoiceToCorrectText()
Definition: pageres.cpp:929
Definition: ocrrow.h:36
bool empty() const
Definition: genericvector.h:90
Definition: ocrblock.h:30
BlamerBundle * blamer_bundle
Definition: pageres.h:246
bool wordrec_debug_blamer
Definition: wordrec.h:236
WERD_CHOICE * best_choice
Definition: pageres.h:235
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
WERD * word
Definition: pageres.h:189

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1591 of file control.cpp.

1593  {
1594  // Return if we do not want to run Tesseract.
1596  return;
1597  }
1598 #ifndef DISABLED_LEGACY_ENGINE
1599  ROW* row = word_data.row;
1600  BLOCK* block = word_data.block;
1601  WERD_RES* word = *in_word;
1602  prev_word_best_choice_ = word_data.prev_word != nullptr
1603  ? word_data.prev_word->word->best_choice : nullptr;
1604 
1606  check_debug_pt(word, 30);
1607  if (!word->done) {
1608  word->caps_height = 0.0;
1609  if (word->x_height == 0.0f)
1610  word->x_height = row->x_height();
1611  match_word_pass_n(2, word, row, block);
1612  check_debug_pt(word, 40);
1613  }
1614 
1615  SubAndSuperscriptFix(word);
1616 
1617  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1619  block->classify_rotation().y() == 0.0f) {
1620  // Use the tops and bottoms since they are available.
1621  TrainedXheightFix(word, block, row);
1622  }
1623 
1625  }
1626 #ifndef GRAPHICS_DISABLED
1628  if (fx_win == nullptr)
1629  create_fx_win();
1630  clear_fx_win();
1631  word->rebuild_word->plot(fx_win);
1632  TBOX wbox = word->rebuild_word->bounding_box();
1633  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1634  wbox.right(), wbox.bottom());
1636  }
1637 #endif
1639  check_debug_pt(word, 50);
1640 #endif // ndef DISABLED_LEGACY_ENGINE
1641 }
bool tess_failed
Definition: pageres.h:288
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:760
void clear_fx_win()
Definition: drawfx.cpp:72
TBOX bounding_box() const
Definition: blobs.cpp:871
Definition: rect.h:34
void create_fx_win()
Definition: drawfx.cpp:59
bool script_has_xheight() const
Definition: unicharset.h:898
bool SubAndSuperscriptFix(WERD_RES *word_res)
static void Update()
Definition: scrollview.cpp:711
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1649
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
int16_t top() const
Definition: rect.h:58
float x_height() const
Definition: ocrrow.h:64
UNICHARSET unicharset
Definition: ccutil.h:68
FCOORD classify_rotation() const
Definition: ocrblock.h:142
void plot(ScrollView *window)
Definition: blobs.cpp:907
Definition: ocrrow.h:36
Definition: ocrblock.h:30
float caps_height
Definition: pageres.h:312
bool done
Definition: pageres.h:298
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:50
float x_height
Definition: pageres.h:311
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
#define SUBLOC_NORM
Definition: errcode.h:59
int16_t bottom() const
Definition: rect.h:65
bool top_bottom_useful() const
Definition: unicharset.h:532
float y() const
Definition: points.h:211
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1504
WERD * word
Definition: pageres.h:189

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
STRING * best_str,
float *  c2 
)

Definition at line 1287 of file control.cpp.

1288  {
1289 #ifndef DISABLED_LEGACY_ENGINE
1290  WERD* real_word = pr_it->word()->word;
1291  WERD* word = real_word->ConstructFromSingleBlob(
1292  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1293  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1294  // Get a new iterator that points to the new word.
1295  PAGE_RES_IT it(pr_it->page_res);
1296  while (it.word() != word_res && it.word() != nullptr) it.forward();
1297  ASSERT_HOST(it.word() == word_res);
1298  WordData wd(it);
1299  // Force full initialization.
1300  SetupWordPassN(1, &wd);
1301  classify_word_and_language(pass_n, &it, &wd);
1302  if (debug_noise_removal) {
1303  if (wd.word->raw_choice != NULL) {
1304  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1305  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1306  wd.word->raw_choice->max_x_height());
1307  } else {
1308  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1309  wd.row->x_height());
1310  }
1311  }
1312  float cert = 0.0f;
1313  if (wd.word->raw_choice != NULL) { // This probably shouldn't happen, but...
1314  cert = wd.word->raw_choice->certainty();
1315  float rat = wd.word->raw_choice->rating();
1316  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1317  *best_str = wd.word->raw_choice->unichar_string();
1318  } else {
1319  *c2 = 0.0f;
1320  *best_str = "";
1321  }
1322  it.DeleteCurrentWord();
1323  pr_it->ResetWordIterator();
1324  return cert;
1325 #else
1326  return 0.1;
1327 #endif // ndef DISABLED_LEGACY_ENGINE
1328 }
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:136
Definition: werd.h:35
void ResetWordIterator()
Definition: pageres.cpp:1533
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
PAGE_RES * page_res
Definition: pageres.h:677
float x_height
Definition: pageres.h:311
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1269
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
#define ASSERT_HOST(x)
Definition: errcode.h:84
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE *> &  outlines,
int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
STRING * best_str 
)

Definition at line 1239 of file control.cpp.

1242  {
1243 #ifndef DISABLED_LEGACY_ENGINE
1244  C_OUTLINE_IT ol_it;
1245  C_OUTLINE* first_to_keep = nullptr;
1246  C_BLOB* local_blob = nullptr;
1247  if (blob != nullptr) {
1248  // Add the required outlines to the blob.
1249  ol_it.set_to_list(blob->out_list());
1250  first_to_keep = ol_it.data();
1251  }
1252  for (int i = 0; i < ok_outlines.size(); ++i) {
1253  if (ok_outlines[i]) {
1254  // This outline is to be added.
1255  if (blob == nullptr) {
1256  local_blob = new C_BLOB(outlines[i]);
1257  blob = local_blob;
1258  ol_it.set_to_list(blob->out_list());
1259  } else {
1260  ol_it.add_before_stay_put(outlines[i]);
1261  }
1262  }
1263  }
1264  float c2;
1265  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1266  ol_it.move_to_first();
1267  if (first_to_keep == nullptr) {
1268  // We created blob. Empty its outlines and delete it.
1269  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1270  delete local_blob;
1271  cert = -c2;
1272  } else {
1273  // Remove the outlines that we put in.
1274  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1275  ol_it.extract();
1276  }
1277  }
1278  return cert;
1279 #else
1280  return 0.1;
1281 #endif // ndef DISABLED_LEGACY_ENGINE
1282 }
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1287
int size() const
Definition: genericvector.h:71
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 569 of file tesseractclass.cpp.

569  {
570  STRING debug_name = imagebasename + "_debug.pdf";
571  pixa_debug_.WritePDF(debug_name.string());
572  pixDestroy(&pix_binary_);
573  pixDestroy(&pix_grey_);
574  pixDestroy(&pix_thresholds_);
575  pixDestroy(&scaled_color_);
576  deskew_ = FCOORD(1.0f, 0.0f);
577  reskew_ = FCOORD(1.0f, 0.0f);
578  splitter_.Clear();
579  scaled_factor_ = -1;
580  for (int i = 0; i < sub_langs_.size(); ++i)
581  sub_langs_[i]->Clear();
582 }
const char * string() const
Definition: strngs.cpp:196
STRING imagebasename
Definition: ccutil.h:65
void WritePDF(const char *filename)
Definition: debugpixa.h:36
Definition: strngs.h:45
Definition: points.h:189

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES * word_res,
float *  baseline_shift 
)

Definition at line 102 of file fixxht.cpp.

103  {
104  STATS top_stats(0, UINT8_MAX);
105  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106  int bottom_shift = 0;
107  int num_blobs = word_res->rebuild_word->NumBlobs();
108  do {
109  top_stats.clear();
110  shift_stats.clear();
111  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
112  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
113  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
114  if (unicharset.get_isalpha(class_id) ||
115  unicharset.get_isdigit(class_id)) {
116  int top = blob->bounding_box().top() + bottom_shift;
117  // Clip the top to the limit of normalized feature space.
118  if (top >= INT_FEAT_RANGE)
119  top = INT_FEAT_RANGE - 1;
120  int bottom = blob->bounding_box().bottom() + bottom_shift;
121  int min_bottom, max_bottom, min_top, max_top;
122  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
123  &min_top, &max_top);
124  // Chars with a wild top range would mess up the result so ignore them.
125  if (max_top - min_top > kMaxCharTopRange)
126  continue;
127  int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
128  top - (max_top + x_ht_acceptance_tolerance));
129  int height = top - kBlnBaselineOffset;
130  if (debug_x_ht_level >= 2) {
131  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
132  unicharset.id_to_unichar(class_id),
133  height, min_bottom, max_bottom, min_top, max_top,
134  bottom, top);
135  }
136  // Use only chars that fit in the expected bottom range, and where
137  // the range of tops is sensibly near the xheight.
138  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
139  bottom - x_ht_acceptance_tolerance <= max_bottom &&
140  min_top > kBlnBaselineOffset &&
141  max_top - kBlnBaselineOffset >= kBlnXHeight &&
142  misfit_dist > 0) {
143  // Compute the x-height position using proportionality between the
144  // actual height and expected height.
145  int min_xht = DivRounded(height * kBlnXHeight,
146  max_top - kBlnBaselineOffset);
147  int max_xht = DivRounded(height * kBlnXHeight,
148  min_top - kBlnBaselineOffset);
149  if (debug_x_ht_level >= 2) {
150  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
151  }
152  // The range of expected heights gets a vote equal to the distance
153  // of the actual top from the expected top.
154  for (int y = min_xht; y <= max_xht; ++y)
155  top_stats.add(y, misfit_dist);
156  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
157  bottom - x_ht_acceptance_tolerance > max_bottom) &&
158  bottom_shift == 0) {
159  // Get the range of required bottom shift.
160  int min_shift = min_bottom - bottom;
161  int max_shift = max_bottom - bottom;
162  if (debug_x_ht_level >= 2) {
163  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
164  }
165  // The range of expected shifts gets a vote equal to the min distance
166  // of the actual bottom from the expected bottom, spread over the
167  // range of its acceptance.
168  int misfit_weight = abs(min_shift);
169  if (max_shift > min_shift)
170  misfit_weight /= max_shift - min_shift;
171  for (int y = min_shift; y <= max_shift; ++y)
172  shift_stats.add(y, misfit_weight);
173  } else {
174  if (bottom_shift == 0) {
175  // Things with bottoms that are already ok need to say so, on the
176  // 1st iteration only.
177  shift_stats.add(0, kBlnBaselineOffset);
178  }
179  if (debug_x_ht_level >= 2) {
180  tprintf(" already OK\n");
181  }
182  }
183  }
184  }
185  if (shift_stats.get_total() > top_stats.get_total()) {
186  bottom_shift = IntCastRounded(shift_stats.median());
187  if (debug_x_ht_level >= 2) {
188  tprintf("Applying bottom shift=%d\n", bottom_shift);
189  }
190  }
191  } while (bottom_shift != 0 &&
192  top_stats.get_total() < shift_stats.get_total());
193  // Baseline shift is opposite sign to the bottom shift.
194  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
195  if (debug_x_ht_level >= 2) {
196  tprintf("baseline shift=%g\n", *baseline_shift);
197  }
198  if (top_stats.get_total() == 0)
199  return bottom_shift != 0 ? word_res->x_height : 0.0f;
200  // The new xheight is just the median vote, which is then scaled out
201  // of BLN space back to pixel space to get the x-height in pixel space.
202  float new_xht = top_stats.median();
203  if (debug_x_ht_level >= 2) {
204  tprintf("Median xht=%f\n", new_xht);
205  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
206  new_xht, new_xht / word_res->denorm.y_scale());
207  }
208  // The xheight must change by at least x_ht_min_change to be used.
209  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
210  return new_xht / word_res->denorm.y_scale();
211  else
212  return bottom_shift != 0 ? word_res->x_height : 0.0f;
213 }
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
int NumBlobs() const
Definition: blobs.h:432
float y_scale() const
Definition: normalis.h:270
const int kBlnXHeight
Definition: normalis.h:24
const int kBlnBaselineOffset
Definition: normalis.h:25
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
Definition: statistc.h:33
int16_t top() const
Definition: rect.h:58
DENORM denorm
Definition: pageres.h:204
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
int IntCastRounded(double x)
Definition: helpers.h:168
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
float x_height
Definition: pageres.h:311
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
Definition: blobs.h:268
int DivRounded(int a, int b)
Definition: helpers.h:162
const int kMaxCharTopRange
Definition: fixxht.cpp:67
#define INT_FEAT_RANGE
Definition: float2int.h:27
int16_t bottom() const
Definition: rect.h:65
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES * word_res)

Definition at line 660 of file docqual.cpp.

660  {
661  int i;
662  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
663  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
664  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
665  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
666  for (i = 0; i < word_res->reject_map.length(); ++i) {
667  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
668  word_res->best_choice->set_unichar_id(unichar_dash, i);
669  if (word_res->reject_map[i].accepted ())
670  word_res->reject_map[i].setrej_unlv_rej ();
671  }
672  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
673  word_res->best_choice->set_unichar_id(unichar_space, i);
674  if (word_res->reject_map[i].accepted ())
675  word_res->reject_map[i].setrej_unlv_rej ();
676  }
677  }
678 }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int UNICHAR_ID
Definition: unichar.h:35
REJMAP reject_map
Definition: pageres.h:287
int32_t length() const
Definition: rejctmap.h:223
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET * uch_set
Definition: pageres.h:206
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES page_res)

◆ count_alphanums() [1/2]

int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE & word)

Definition at line 383 of file output.cpp.

383  {
384  int count = 0;
385  for (int i = 0; i < word.length(); ++i) {
386  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
387  word.unicharset()->get_isdigit(word.unichar_id(i)))
388  count++;
389  }
390  return count;
391 }
int count(LIST var_list)
Definition: oldlist.cpp:98
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303

◆ count_alphanums() [2/2]

int16_t tesseract::Tesseract::count_alphanums ( WERD_RES * word)

Definition at line 563 of file reject.cpp.

563  {
564  int count = 0;
565  const WERD_CHOICE *best_choice = word_res->best_choice;
566  for (int i = 0; i < word_res->reject_map.length(); ++i) {
567  if ((word_res->reject_map[i].accepted()) &&
568  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
569  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
570  count++;
571  }
572  }
573  return count;
574 }
int count(LIST var_list)
Definition: oldlist.cpp:98
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ count_alphas()

int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE & word)

Definition at line 373 of file output.cpp.

373  {
374  int count = 0;
375  for (int i = 0; i < word.length(); ++i) {
376  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
377  count++;
378  }
379  return count;
380 }
int count(LIST var_list)
Definition: oldlist.cpp:98
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303

◆ count_outline_errs()

int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 127 of file docqual.cpp.

127  {
128  int expected_outline_count;
129 
130  if (STRING (outlines_odd).contains (c))
131  return 0; // Don't use this char
132  else if (STRING (outlines_2).contains (c))
133  expected_outline_count = 2;
134  else
135  expected_outline_count = 1;
136  return abs (outline_count - expected_outline_count);
137 }
Definition: strngs.h:45

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES * word_res)

Definition at line 70 of file fixxht.cpp.

70  {
71  int bad_blobs = 0;
72  int num_blobs = word_res->rebuild_word->NumBlobs();
73  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
74  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
75  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
76  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
77  int top = blob->bounding_box().top();
78  if (top >= INT_FEAT_RANGE)
79  top = INT_FEAT_RANGE - 1;
80  int min_bottom, max_bottom, min_top, max_top;
81  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
82  &min_top, &max_top);
83  if (max_top - min_top > kMaxCharTopRange)
84  continue;
85  bool bad = top < min_top - x_ht_acceptance_tolerance ||
86  top > max_top + x_ht_acceptance_tolerance;
87  if (bad)
88  ++bad_blobs;
89  if (debug_x_ht_level >= 1) {
90  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
91  unicharset.id_to_unichar(class_id),
92  bad ? "Misfit" : "OK", top, min_top, max_top,
93  static_cast<int>(x_ht_acceptance_tolerance));
94  }
95  }
96  }
97  return bad_blobs;
98 }
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
int NumBlobs() const
Definition: blobs.h:432
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
int16_t top() const
Definition: rect.h:58
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
Definition: blobs.h:268
const int kMaxCharTopRange
Definition: fixxht.cpp:67
#define INT_FEAT_RANGE
Definition: float2int.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES * page_res,
const TBOX & selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 637 of file pgedit.cpp.

637  {
638 #ifndef DISABLED_LEGACY_ENGINE
640 #endif
641  recog_all_words(page_res, nullptr, &selection_box, word_config_.string(), 0);
642 }
const char * string() const
Definition: strngs.cpp:196
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:308

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES page_res)

Definition at line 2117 of file control.cpp.

2117  {
2118  PAGE_RES_IT word_it(page_res);
2119  for (WERD_RES* word = word_it.word(); word != nullptr;
2120  word = word_it.forward()) {
2121  if (word->best_choices.singleton())
2122  continue; // There are no alternates.
2123 
2124  const WERD_CHOICE* best = word->best_choice;
2125  if (word->tesseract->getDict().valid_word(*best) != 0)
2126  continue; // The best choice is in the dictionary.
2127 
2128  WERD_CHOICE_IT choice_it(&word->best_choices);
2129  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2130  choice_it.forward()) {
2131  WERD_CHOICE* alternate = choice_it.data();
2132  if (word->tesseract->getDict().valid_word(*alternate)) {
2133  // The alternate choice is in the dictionary.
2134  if (tessedit_bigram_debug) {
2135  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2136  best->unichar_string().string(),
2137  alternate->unichar_string().string());
2138  }
2139  // Replace the 'best' choice with a better choice.
2140  word->ReplaceBestChoice(alternate);
2141  break;
2142  }
2143  }
2144  }
2145 }
const char * string() const
Definition: strngs.cpp:196
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD * word
Definition: pageres.h:189

◆ digit_or_numeric_punct()

bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES * word,
int  char_position 
)

Definition at line 373 of file fixspace.cpp.

373  {
374  int i;
375  int offset;
376 
377  for (i = 0, offset = 0; i < char_position;
378  offset += word->best_choice->unichar_lengths()[i++]);
379  return (
380  word->uch_set->get_isdigit(
381  word->best_choice->unichar_string().string() + offset,
382  word->best_choice->unichar_lengths()[i]) ||
383  (word->best_choice->permuter() == NUMBER_PERM &&
385  word->best_choice->unichar_string().string()[offset])));
386 }
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 298 of file pgedit.cpp.

299  {
300  int block_count = 1;
301 
302  image_win->Clear();
303  if (display_image != 0) {
304  image_win->Image(pix_binary_, 0, 0);
305  }
306 
307  image_win->Brush(ScrollView::NONE);
309  for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
310  (this->*word_painter)(&pr_it);
311  if (display_baselines && pr_it.row() != pr_it.prev_row())
312  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
313  if (display_blocks && pr_it.block() != pr_it.prev_block())
314  pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
315  }
316  image_win->Update();
317 }
BOOL8 display_baselines
Definition: pgedit.cpp:119
BOOL8 display_image
Definition: pgedit.cpp:117
PAGE_RES * current_page_res
Definition: pgedit.cpp:121
BOOL8 display_blocks
Definition: pgedit.cpp:118
static void Update()
Definition: scrollview.cpp:711
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:768
void Clear()
Definition: scrollview.cpp:591
WERD * word
Definition: pageres.h:189
void Brush(Color color)
Definition: scrollview.cpp:728

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT & page_res_it,
bool  good_quality_doc 
)

Definition at line 233 of file docqual.cpp.

235  {
236  int16_t block_no = 0;
237  int16_t row_no = 0;
238  BLOCK_RES *current_block;
239  ROW_RES *current_row;
240 
241  bool rej_word;
242  bool prev_word_rejected;
243  int16_t char_quality = 0;
244  int16_t accepted_char_quality;
245 
246  if (page_res_it.page_res->rej_count * 100.0 /
248  reject_whole_page(page_res_it);
250  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
251  page_res_it.page_res->char_count,
252  page_res_it.page_res->rej_count);
253  }
254  } else {
256  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
257  page_res_it.page_res->char_count,
258  page_res_it.page_res->rej_count);
259  }
260 
261  /* Walk blocks testing for block rejection */
262 
263  page_res_it.restart_page();
264  WERD_RES* word;
265  while ((word = page_res_it.word()) != nullptr) {
266  current_block = page_res_it.block();
267  block_no = current_block->block->pdblk.index();
268  if (current_block->char_count > 0 &&
269  (current_block->rej_count * 100.0 / current_block->char_count) >
272  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
273  block_no, current_block->char_count,
274  current_block->rej_count);
275  }
276  prev_word_rejected = false;
277  while ((word = page_res_it.word()) != nullptr &&
278  (page_res_it.block() == current_block)) {
280  rej_word = word->reject_map.reject_count() > 0 ||
282  if (rej_word && tessedit_dont_blkrej_good_wds &&
285  *word->uch_set,
286  word->best_choice->unichar_string().string(),
287  word->best_choice->unichar_lengths().string()) !=
288  AC_UNACCEPTABLE) {
289  word_char_quality(word, page_res_it.row()->row,
290  &char_quality,
291  &accepted_char_quality);
292  rej_word = char_quality != word->reject_map.length();
293  }
294  } else {
295  rej_word = true;
296  }
297  if (rej_word) {
298  /*
299  Reject spacing if both current and prev words are rejected.
300  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
301  generated more space errors.
302  */
304  prev_word_rejected &&
305  page_res_it.prev_row() == page_res_it.row() &&
306  word->word->space() == 1)
307  word->reject_spaces = true;
309  }
310  prev_word_rejected = rej_word;
311  page_res_it.forward();
312  }
313  } else {
315  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
316  block_no, page_res_it.block()->char_count,
317  page_res_it.block()->rej_count);
318  }
319 
320  /* Walk rows in block testing for row rejection */
321  row_no = 0;
322  while (page_res_it.word() != nullptr &&
323  page_res_it.block() == current_block) {
324  current_row = page_res_it.row();
325  row_no++;
326  /* Reject whole row if:
327  fraction of chars on row which are rejected exceed a limit AND
328  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
329  limit
330  */
331  if (current_row->char_count > 0 &&
332  (current_row->rej_count * 100.0 / current_row->char_count) >
334  (current_row->whole_word_rej_count * 100.0 /
335  current_row->rej_count) <
338  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
339  row_no, current_row->char_count,
340  current_row->rej_count);
341  }
342  prev_word_rejected = false;
343  while ((word = page_res_it.word()) != nullptr &&
344  page_res_it.row () == current_row) {
345  /* Preserve words on good docs unless they are mostly rejected*/
346  if (!tessedit_row_rej_good_docs && good_quality_doc) {
347  rej_word = word->reject_map.reject_count() /
348  static_cast<float>(word->reject_map.length()) >
351  /* Preserve perfect words anyway */
352  rej_word = word->reject_map.reject_count() > 0 ||
354  if (rej_word && tessedit_dont_rowrej_good_wds &&
357  word->best_choice->unichar_string().string(),
358  word->best_choice->unichar_lengths().string()) !=
359  AC_UNACCEPTABLE) {
360  word_char_quality(word, page_res_it.row()->row,
361  &char_quality,
362  &accepted_char_quality);
363  rej_word = char_quality != word->reject_map.length();
364  }
365  } else {
366  rej_word = true;
367  }
368  if (rej_word) {
369  /*
370  Reject spacing if both current and prev words are rejected.
371  NOTE - this is NOT restricted to FUZZY spaces. - When tried
372  this generated more space errors.
373  */
375  prev_word_rejected &&
376  page_res_it.prev_row() == page_res_it.row() &&
377  word->word->space () == 1)
378  word->reject_spaces = true;
380  }
381  prev_word_rejected = rej_word;
382  page_res_it.forward();
383  }
384  } else {
386  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
387  row_no, current_row->char_count, current_row->rej_count);
388  }
389  while (page_res_it.word() != nullptr &&
390  page_res_it.row() == current_row)
391  page_res_it.forward();
392  }
393  }
394  }
395  }
396  }
397 }
BLOCK_RES * block() const
Definition: pageres.h:757
void rej_word_block_rej()
Definition: rejctmap.cpp:435
int32_t rej_count
Definition: pageres.h:80
ROW_RES * row() const
Definition: pageres.h:754
int32_t whole_word_rej_count
Definition: pageres.h:146
REJMAP reject_map
Definition: pageres.h:287
void rej_word_row_rej()
Definition: rejctmap.cpp:444
const char * string() const
Definition: strngs.cpp:196
int32_t char_count
Definition: pageres.h:118
int32_t length() const
Definition: rejctmap.h:223
double tessedit_reject_doc_percent
double tessedit_reject_block_percent
uint8_t space()
Definition: werd.h:102
double tessedit_reject_row_percent
bool tessedit_preserve_blk_rej_perfect_wds
double tessedit_good_doc_still_rowrej_wd
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
int index() const
Definition: pdblock.h:68
const STRING & unichar_lengths() const
Definition: ratngs.h:548
ROW_RES * prev_row() const
Definition: pageres.h:745
double tessedit_whole_wd_rej_row_percent
int16_t reject_count()
Definition: rejctmap.h:229
bool tessedit_preserve_row_rej_perfect_wds
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int32_t char_count
Definition: pageres.h:79
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
PAGE_RES * page_res
Definition: pageres.h:677
bool reject_spaces
Definition: pageres.h:336
int32_t rej_count
Definition: pageres.h:119
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_RES * forward()
Definition: pageres.h:731
int32_t char_count
Definition: pageres.h:144
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:407
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
int32_t rej_count
Definition: pageres.h:145
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
PDBLK pdblk
Definition: ocrblock.h:192
WERD_CHOICE * best_choice
Definition: pageres.h:235
ROW * row
Definition: pageres.h:143
WERD * word
Definition: pageres.h:189

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word)

Definition at line 531 of file reject.cpp.

531  {
532  int i = 0;
533  int offset;
534  int word_len = word->reject_map.length();
535  const char *s = word->best_choice->unichar_string().string();
536  const char *lengths = word->best_choice->unichar_lengths().string();
537  bool accepted_1Il = false;
538 
539  for (i = 0, offset = 0; i < word_len;
540  offset += word->best_choice->unichar_lengths()[i++]) {
541  if (word->reject_map[i].accepted()) {
542  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
543  accepted_1Il = true;
544  } else {
545  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
546  word->uch_set->get_isdigit(s + offset, lengths[i]))
547  return; // >=1 non 1Il ch accepted
548  }
549  }
550  }
551  if (!accepted_1Il)
552  return; //Nothing to worry about
553 
554  for (i = 0, offset = 0; i < word_len;
555  offset += word->best_choice->unichar_lengths()[i++]) {
556  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
557  word->reject_map[i].accepted())
558  word->reject_map[i].setrej_postNN_1Il();
559  }
560 }
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t length() const
Definition: rejctmap.h:223
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 479 of file fixspace.cpp.

480  {
481  WERD_RES_IT word_res_it(&perm);
482 
483  if (debug_fix_space_level > 0) {
484  if (mode == 1) {
485  stats_.dump_words_str = "";
486  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
487  word_res_it.forward()) {
488  if (!word_res_it.data()->part_of_combo) {
489  stats_.dump_words_str +=
490  word_res_it.data()->best_choice->unichar_string();
491  stats_.dump_words_str += ' ';
492  }
493  }
494  }
495 
496  if (debug_fix_space_level > 1) {
497  switch (mode) {
498  case 1:
499  tprintf("EXTRACTED (%d): \"", score);
500  break;
501  case 2:
502  tprintf("TESTED (%d): \"", score);
503  break;
504  case 3:
505  tprintf("RETURNED (%d): \"", score);
506  break;
507  }
508 
509  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
510  word_res_it.forward()) {
511  if (!word_res_it.data()->part_of_combo) {
512  tprintf("%s/%1d ",
513  word_res_it.data()->best_choice->unichar_string().string(),
514  (int)word_res_it.data()->best_choice->permuter());
515  }
516  }
517  tprintf("\"\n");
518  } else if (improved) {
519  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
520  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
521  word_res_it.forward()) {
522  if (!word_res_it.data()->part_of_combo) {
523  tprintf("%s/%1d ",
524  word_res_it.data()->best_choice->unichar_string().string(),
525  (int)word_res_it.data()->best_choice->permuter());
526  }
527  }
528  tprintf("\"\n");
529  }
530  }
531 }
const char * string() const
Definition: strngs.cpp:196
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 475 of file tessedit.cpp.

475  {
476  end_recog();
477 }

◆ eval_word_spacing()

int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 269 of file fixspace.cpp.

269  {
270  WERD_RES_IT word_res_it(&word_res_list);
271  int16_t total_score = 0;
272  int16_t word_count = 0;
273  int16_t done_word_count = 0;
274  int16_t word_len;
275  int16_t i;
276  int16_t offset;
277  WERD_RES *word; // current word
278  int16_t prev_word_score = 0;
279  bool prev_word_done = false;
280  bool prev_char_1 = false; // prev ch a "1/I/l"?
281  bool prev_char_digit = false; // prev ch 2..9 or 0
282  bool current_char_1 = false;
283  bool current_word_ok_so_far;
284  STRING punct_chars = "!\"`',.:;";
285  bool prev_char_punct = false;
286  bool current_char_punct = false;
287  bool word_done = false;
288 
289  do {
290  word = word_res_it.data();
291  word_done = fixspace_thinks_word_done(word);
292  word_count++;
293  if (word->tess_failed) {
294  total_score += prev_word_score;
295  if (prev_word_done)
296  done_word_count++;
297  prev_word_score = 0;
298  prev_char_1 = false;
299  prev_char_digit = false;
300  prev_word_done = false;
301  } else {
302  /*
303  Can we add the prev word score and potentially count this word?
304  Yes IF it didn't end in a 1 when the first char of this word is a digit
305  AND it didn't end in a digit when the first char of this word is a 1
306  */
307  word_len = word->reject_map.length();
308  current_word_ok_so_far = false;
309  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
310  (prev_char_digit && (
311  (word_done &&
312  word->best_choice->unichar_lengths().string()[0] == 1 &&
313  word->best_choice->unichar_string()[0] == '1') ||
314  (!word_done && STRING(conflict_set_I_l_1).contains(
315  word->best_choice->unichar_string()[0])))))) {
316  total_score += prev_word_score;
317  if (prev_word_done)
318  done_word_count++;
319  current_word_ok_so_far = word_done;
320  }
321 
322  if (current_word_ok_so_far) {
323  prev_word_done = true;
324  prev_word_score = word_len;
325  } else {
326  prev_word_done = false;
327  prev_word_score = 0;
328  }
329 
330  /* Add 1 to total score for every joined 1 regardless of context and
331  rejtn */
332  for (i = 0, prev_char_1 = false; i < word_len; i++) {
333  current_char_1 = word->best_choice->unichar_string()[i] == '1';
334  if (prev_char_1 || (current_char_1 && (i > 0)))
335  total_score++;
336  prev_char_1 = current_char_1;
337  }
338 
339  /* Add 1 to total score for every joined punctuation regardless of context
340  and rejtn */
342  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
343  offset += word->best_choice->unichar_lengths()[i++]) {
344  current_char_punct =
345  punct_chars.contains(word->best_choice->unichar_string()[offset]);
346  if (prev_char_punct || (current_char_punct && i > 0))
347  total_score++;
348  prev_char_punct = current_char_punct;
349  }
350  }
351  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
352  for (i = 0, offset = 0; i < word_len - 1;
353  offset += word->best_choice->unichar_lengths()[i++]);
354  prev_char_1 =
355  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
356  || (!word_done && STRING(conflict_set_I_l_1).contains(
357  word->best_choice->unichar_string()[offset])));
358  }
359  /* Find next word */
360  do {
361  word_res_it.forward();
362  } while (word_res_it.data()->part_of_combo);
363  } while (!word_res_it.at_first());
364  total_score += prev_word_score;
365  if (prev_word_done)
366  done_word_count++;
367  if (done_word_count == word_count)
368  return PERFECT_WERDS;
369  else
370  return total_score;
371 }
bool tess_failed
Definition: pageres.h:288
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:373
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t length() const
Definition: rejctmap.h:223
const STRING & unichar_lengths() const
Definition: ratngs.h:548
#define PERFECT_WERDS
Definition: fixspace.cpp:46
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:533
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ failure_count()

int16_t tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 966 of file docqual.cpp.

966  {
967  const char *str = word->best_choice->unichar_string().string();
968  int tess_rejs = 0;
969 
970  for (; *str != '\0'; str++) {
971  if (*str == ' ')
972  tess_rejs++;
973  }
974  return tess_rejs;
975 }
const char * string() const
Definition: strngs.cpp:196
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES word_res 
)

◆ first_alphanum_index()

int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 474 of file reject.cpp.

475  {
476  int16_t i;
477  int16_t offset;
478 
479  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
480  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
481  unicharset.get_isdigit(word + offset, word_lengths[i]))
482  return i;
483  }
484  return -1;
485 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68

◆ first_alphanum_offset()

int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 487 of file reject.cpp.

488  {
489  int16_t i;
490  int16_t offset;
491 
492  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
493  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
494  unicharset.get_isdigit(word + offset, word_lengths[i]))
495  return offset;
496  }
497  return -1;
498 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 175 of file fixspace.cpp.

177  {
178  int16_t best_score;
179  WERD_RES_LIST current_perm;
180  int16_t current_score;
181  bool improved = false;
182 
183  best_score = eval_word_spacing(best_perm); // default score
184  dump_words(best_perm, best_score, 1, improved);
185 
186  if (best_score != PERFECT_WERDS)
187  initialise_search(best_perm, current_perm);
188 
189  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
190  match_current_words(current_perm, row, block);
191  current_score = eval_word_spacing(current_perm);
192  dump_words(current_perm, current_score, 2, improved);
193  if (current_score > best_score) {
194  best_perm.clear();
195  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
196  best_score = current_score;
197  improved = true;
198  }
199  if (current_score < PERFECT_WERDS)
200  transform_to_next_perm(current_perm);
201  }
202  dump_words(best_perm, best_score, 3, improved);
203 }
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:269
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:479
#define PERFECT_WERDS
Definition: fixspace.cpp:46
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:207
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:226
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:402

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
int32_t  word_count,
PAGE_RES * page_res 
)

Definition at line 78 of file fixspace.cpp.

80  {
81  BLOCK_RES_IT block_res_it;
82  ROW_RES_IT row_res_it;
83  WERD_RES_IT word_res_it_from;
84  WERD_RES_IT word_res_it_to;
85  WERD_RES *word_res;
86  WERD_RES_LIST fuzzy_space_words;
87  int16_t new_length;
88  bool prevent_null_wd_fixsp; // DON'T process blobless wds
89  int32_t word_index; // current word
90 
91  block_res_it.set_to_list(&page_res->block_res_list);
92  word_index = 0;
93  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
94  block_res_it.forward()) {
95  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
96  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
97  row_res_it.forward()) {
98  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
99  while (!word_res_it_from.at_last()) {
100  word_res = word_res_it_from.data();
101  while (!word_res_it_from.at_last() &&
102  !(word_res->combination ||
103  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
104  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
105  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
106  block_res_it.data()->block);
107  word_res = word_res_it_from.forward();
108  word_index++;
109  if (monitor != nullptr) {
110  monitor->ocr_alive = TRUE;
111  monitor->progress = 90 + 5 * word_index / word_count;
112  if (monitor->deadline_exceeded() ||
113  (monitor->cancel != nullptr &&
114  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
115  return;
116  }
117  }
118 
119  if (!word_res_it_from.at_last()) {
120  word_res_it_to = word_res_it_from;
121  prevent_null_wd_fixsp =
122  word_res->word->cblob_list()->empty();
123  if (check_debug_pt(word_res, 60))
124  debug_fix_space_level.set_value(10);
125  word_res_it_to.forward();
126  word_index++;
127  if (monitor != nullptr) {
128  monitor->ocr_alive = TRUE;
129  monitor->progress = 90 + 5 * word_index / word_count;
130  if (monitor->deadline_exceeded() ||
131  (monitor->cancel != nullptr &&
132  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
133  return;
134  }
135  while (!word_res_it_to.at_last () &&
136  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
137  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
138  if (check_debug_pt(word_res, 60))
139  debug_fix_space_level.set_value(10);
140  if (word_res->word->cblob_list()->empty())
141  prevent_null_wd_fixsp = true;
142  word_res = word_res_it_to.forward();
143  }
144  if (check_debug_pt(word_res, 60))
145  debug_fix_space_level.set_value(10);
146  if (word_res->word->cblob_list()->empty())
147  prevent_null_wd_fixsp = true;
148  if (prevent_null_wd_fixsp) {
149  word_res_it_from = word_res_it_to;
150  } else {
151  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
152  &word_res_it_to);
153  fix_fuzzy_space_list(fuzzy_space_words,
154  row_res_it.data()->row,
155  block_res_it.data()->block);
156  new_length = fuzzy_space_words.length();
157  word_res_it_from.add_list_before(&fuzzy_space_words);
158  for (;
159  !word_res_it_from.at_last() && new_length > 0;
160  new_length--) {
161  word_res_it_from.forward();
162  }
163  }
164  if (test_pt)
165  debug_fix_space_level.set_value(0);
166  }
167  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
168  block_res_it.data()->block);
169  // Last word in row
170  }
171  }
172  }
173 }
#define TRUE
Definition: capi.h:51
void * cancel_this
this or other data for cancel
Definition: ocrclass.h:132
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:565
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:127
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
bool deadline_exceeded() const
Definition: ocrclass.h:164
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:175
bool combination
Definition: pageres.h:334
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:129
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:122

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW *  row,
BLOCK *  block 
)

Definition at line 599 of file fixspace.cpp.

600  {
601  int16_t best_score;
602  WERD_RES_IT best_perm_it(&best_perm);
603  WERD_RES_LIST current_perm;
604  WERD_RES_IT current_perm_it(&current_perm);
605  WERD_RES *old_word_res;
606  int16_t current_score;
607  bool improved = false;
608 
609  best_score = fp_eval_word_spacing(best_perm); // default score
610 
611  dump_words(best_perm, best_score, 1, improved);
612 
613  old_word_res = best_perm_it.data();
614  // Even deep_copy doesn't copy the underlying WERD unless its combination
615  // flag is true!.
616  old_word_res->combination = true; // Kludge to force deep copy
617  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
618  old_word_res->combination = false; // Undo kludge
619 
620  break_noisiest_blob_word(current_perm);
621 
622  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
623  match_current_words(current_perm, row, block);
624  current_score = fp_eval_word_spacing(current_perm);
625  dump_words(current_perm, current_score, 2, improved);
626  if (current_score > best_score) {
627  best_perm.clear();
628  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
629  best_score = current_score;
630  improved = true;
631  }
632  if (current_score < PERFECT_WERDS) {
633  break_noisiest_blob_word(current_perm);
634  }
635  }
636  dump_words(best_perm, best_score, 3, improved);
637 }
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:479
#define PERFECT_WERDS
Definition: fixspace.cpp:46
bool combination
Definition: pageres.h:334
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:226
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:645
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:860

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT *  page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1725 of file control.cpp.

1725  {
1726  WERD_RES *word_res = page_res_it->word();
1727  const WERD_CHOICE &word = *(word_res->best_choice);
1728 
1729  // Find the frequency of each unique character in the word.
1730  SortHelper<UNICHAR_ID> rep_ch(word.length());
1731  for (int i = 0; i < word.length(); ++i) {
1732  rep_ch.Add(word.unichar_id(i), 1);
1733  }
1734 
1735  // Find the most frequent result.
1736  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1737  int max_count = rep_ch.MaxCount(&maxch_id);
1738  // Find the best exemplar of a classifier result for maxch_id.
1739  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1740  if (best_choice == nullptr) {
1741  tprintf("Failed to find a choice for %s, occurring %d times\n",
1742  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1743  return;
1744  }
1745  word_res->done = TRUE;
1746 
1747  // Measure the mean space.
1748  int gap_count = 0;
1749  WERD* werd = word_res->word;
1750  C_BLOB_IT blob_it(werd->cblob_list());
1751  C_BLOB* prev_blob = blob_it.data();
1752  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1753  C_BLOB* blob = blob_it.data();
1754  int gap = blob->bounding_box().left();
1755  gap -= prev_blob->bounding_box().right();
1756  ++gap_count;
1757  prev_blob = blob;
1758  }
1759  // Just correct existing classification.
1760  CorrectRepcharChoices(best_choice, word_res);
1761  word_res->reject_map.initialise(word.length());
1762 }
int UNICHAR_ID
Definition: unichar.h:35
#define TRUE
Definition: capi.h:51
void Add(T value, int count)
Definition: sorthelper.h:65
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int16_t left() const
Definition: rect.h:72
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
bool done
Definition: pageres.h:298
TBOX bounding_box() const
Definition: stepblob.cpp:255
const UNICHARSET * uch_set
Definition: pageres.h:206
WERD_CHOICE * best_choice
Definition: pageres.h:235
void initialise(int16_t length)
Definition: rejctmap.cpp:275
WERD * word
Definition: pageres.h:189

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW *  row,
BLOCK *  block 
)

Definition at line 565 of file fixspace.cpp.

566  {
567  WERD_RES *word_res;
568  WERD_RES_LIST sub_word_list;
569  WERD_RES_IT sub_word_list_it(&sub_word_list);
570  int16_t blob_index;
571  int16_t new_length;
572  float junk;
573 
574  word_res = word_res_it.data();
575  if (word_res->word->flag(W_REP_CHAR) ||
576  word_res->combination ||
577  word_res->part_of_combo ||
578  !word_res->word->flag(W_DONT_CHOP))
579  return;
580 
581  blob_index = worst_noise_blob(word_res, &junk);
582  if (blob_index < 0)
583  return;
584 
585  if (debug_fix_space_level > 1) {
586  tprintf("FP fixspace working on \"%s\"\n",
587  word_res->best_choice->unichar_string().string());
588  }
589  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
590  sub_word_list_it.add_after_stay_put(word_res_it.extract());
591  fix_noisy_space_list(sub_word_list, row, block);
592  new_length = sub_word_list.length();
593  word_res_it.add_list_before(&sub_word_list);
594  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
595  word_res_it.forward();
596  }
597 }
const char * string() const
Definition: strngs.cpp:196
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool combination
Definition: pageres.h:334
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710
const STRING & unichar_string() const
Definition: ratngs.h:541
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:599
bool part_of_combo
Definition: pageres.h:335
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ fixspace_thinks_word_done()

bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES *  word)

Definition at line 533 of file fixspace.cpp.

533  {
534  if (word->done)
535  return true;
536 
537  /*
538  Use all the standard pass 2 conditions for mode 5 in set_done() in
539  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
540  CARE WHETHER WE HAVE of/at on/an etc.
541  */
542  if (fixsp_done_mode > 0 &&
543  (word->tess_accepted ||
544  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
545  fixsp_done_mode == 3) &&
546  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr) &&
547  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
548  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
549  (word->best_choice->permuter() == USER_DAWG_PERM) ||
550  (word->best_choice->permuter() == NUMBER_PERM))) {
551  return true;
552  } else {
553  return false;
554  }
555 }
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
int16_t reject_count()
Definition: rejctmap.h:229
bool tess_accepted
Definition: pageres.h:296
bool done
Definition: pageres.h:298
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES *  word)

Definition at line 678 of file reject.cpp.

678  {
679  WERD_CHOICE *best_choice = word_res->best_choice;
680  int i;
681  TBOX out_box;
682 
683  if (!tessedit_flip_0O)
684  return;
685 
686  int num_blobs = word_res->rebuild_word->NumBlobs();
687  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
688  TBLOB* blob = word_res->rebuild_word->blobs[i];
689  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
690  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
691  out_box = blob->bounding_box();
692  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
693  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
694  return; //Beware words with sub/superscripts
695  }
696  }
697  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
698  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
699  if (unichar_0 == INVALID_UNICHAR_ID ||
700  !word_res->uch_set->get_enabled(unichar_0) ||
701  unichar_O == INVALID_UNICHAR_ID ||
702  !word_res->uch_set->get_enabled(unichar_O)) {
703  return; // 0 or O are not present/enabled in unicharset
704  }
705  for (i = 1; i < best_choice->length(); ++i) {
706  if (best_choice->unichar_id(i) == unichar_0 ||
707  best_choice->unichar_id(i) == unichar_O) {
708  /* A0A */
709  if ((i+1) < best_choice->length() &&
710  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
711  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
712  best_choice->set_unichar_id(unichar_O, i);
713  }
714  /* A00A */
715  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
716  (i+1) < best_choice->length() &&
717  (best_choice->unichar_id(i+1) == unichar_0 ||
718  best_choice->unichar_id(i+1) == unichar_O) &&
719  (i+2) < best_choice->length() &&
720  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
721  best_choice->set_unichar_id(unichar_O, i);
722  i++;
723  }
724  /* AA0<non digit or end of word> */
725  if ((i > 1) &&
726  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
727  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
728  (((i+1) < best_choice->length() &&
729  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
730  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
731  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
732  (i == best_choice->length() - 1))) {
733  best_choice->set_unichar_id(unichar_O, i);
734  }
735  /* 9O9 */
736  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
737  (i+1) < best_choice->length() &&
738  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
739  best_choice->set_unichar_id(unichar_0, i);
740  }
741  /* 9OOO */
742  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
743  (i+2) < best_choice->length() &&
744  (best_choice->unichar_id(i+1) == unichar_0 ||
745  best_choice->unichar_id(i+1) == unichar_O) &&
746  (best_choice->unichar_id(i+2) == unichar_0 ||
747  best_choice->unichar_id(i+2) == unichar_O)) {
748  best_choice->set_unichar_id(unichar_0, i);
749  best_choice->set_unichar_id(unichar_0, i+1);
750  best_choice->set_unichar_id(unichar_0, i+2);
751  i += 2;
752  }
753  /* 9OO<non upper> */
754  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
755  (i+2) < best_choice->length() &&
756  (best_choice->unichar_id(i+1) == unichar_0 ||
757  best_choice->unichar_id(i+1) == unichar_O) &&
758  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
759  best_choice->set_unichar_id(unichar_0, i);
760  best_choice->set_unichar_id(unichar_0, i+1);
761  i++;
762  }
763  /* 9O<non upper> */
764  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
765  (i+1) < best_choice->length() &&
766  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
767  best_choice->set_unichar_id(unichar_0, i);
768  }
769  /* 9[.,]OOO.. */
770  if ((i > 1) &&
771  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
772  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
773  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
774  best_choice->unichar_id(i-2) == unichar_O)) {
775  if (best_choice->unichar_id(i-2) == unichar_O) {
776  best_choice->set_unichar_id(unichar_0, i-2);
777  }
778  while (i < best_choice->length() &&
779  (best_choice->unichar_id(i) == unichar_O ||
780  best_choice->unichar_id(i) == unichar_0)) {
781  best_choice->set_unichar_id(unichar_0, i);
782  i++;
783  }
784  i--;
785  }
786  }
787  }
788 }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int UNICHAR_ID
Definition: unichar.h:35
Definition: rect.h:34
const int kBlnXHeight
Definition: normalis.h:24
const int kBlnBaselineOffset
Definition: normalis.h:25
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:794
int16_t top() const
Definition: rect.h:58
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
TBOX bounding_box() const
Definition: blobs.cpp:478
int length() const
Definition: ratngs.h:303
Definition: blobs.h:268
int16_t bottom() const
Definition: rect.h:65

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES *  word)

Definition at line 621 of file reject.cpp.

621  {
622  WERD_CHOICE *best_choice = word_res->best_choice;
623  int i;
624  int prev_right = -9999;
625  int next_left;
626  TBOX out_box;
627  float aspect_ratio;
628 
629  if (tessedit_lower_flip_hyphen <= 1)
630  return;
631 
632  int num_blobs = word_res->rebuild_word->NumBlobs();
633  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
634  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
635  TBLOB* blob = word_res->rebuild_word->blobs[i];
636  out_box = blob->bounding_box();
637  if (i + 1 == num_blobs)
638  next_left = 9999;
639  else
640  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
641  // Don't touch small or touching blobs - it is too dangerous.
642  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
643  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
644  aspect_ratio = out_box.width() / (float) out_box.height();
645  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
646  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
647  word_res->uch_set->contains_unichar_id(unichar_dash) &&
648  word_res->uch_set->get_enabled(unichar_dash)) {
649  /* Certain HYPHEN */
650  best_choice->set_unichar_id(unichar_dash, i);
651  if (word_res->reject_map[i].rejected())
652  word_res->reject_map[i].setrej_hyphen_accept();
653  }
654  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
655  word_res->reject_map[i].accepted())
656  //Suspected HYPHEN
657  word_res->reject_map[i].setrej_hyphen ();
658  }
659  else if (best_choice->unichar_id(i) == unichar_dash) {
660  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
661  (word_res->reject_map[i].rejected()))
662  word_res->reject_map[i].setrej_hyphen_accept();
663  //Certain HYPHEN
664 
665  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
666  (word_res->reject_map[i].accepted()))
667  //Suspected HYPHEN
668  word_res->reject_map[i].setrej_hyphen();
669  }
670  }
671  prev_right = out_box.right();
672  }
673 }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int UNICHAR_ID
Definition: unichar.h:35
Definition: rect.h:34
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
TBOX bounding_box() const
Definition: blobs.cpp:478
int length() const
Definition: ratngs.h:303
int16_t right() const
Definition: rect.h:79
Definition: blobs.h:268
int16_t height() const
Definition: rect.h:108

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES *  page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2060 of file control.cpp.

2060  {
2061  PAGE_RES_IT page_res_it(page_res);
2062  WERD_RES *word; // current word
2063  STATS doc_fonts(0, font_table_size_); // font counters
2064 
2065  // Gather font id statistics.
2066  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2067  page_res_it.forward()) {
2068  word = page_res_it.word();
2069  if (word->fontinfo != nullptr) {
2070  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2071  }
2072  if (word->fontinfo2 != nullptr) {
2073  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2074  }
2075  }
2076  int16_t doc_font; // modal font
2077  int8_t doc_font_count; // modal font
2078  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2079  if (doc_font_count == 0)
2080  return;
2081  // Get the modal font pointer.
2082  const FontInfo* modal_font = nullptr;
2083  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2084  page_res_it.forward()) {
2085  word = page_res_it.word();
2086  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2087  modal_font = word->fontinfo;
2088  break;
2089  }
2090  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2091  modal_font = word->fontinfo2;
2092  break;
2093  }
2094  }
2095  ASSERT_HOST(modal_font != nullptr);
2096 
2097  // Assign modal font to weak words.
2098  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2099  page_res_it.forward()) {
2100  word = page_res_it.word();
2101  const int length = word->best_choice->length();
2102 
2103  const int count = word->fontinfo_id_count;
2104  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2105  word->fontinfo = modal_font;
2106  // Counts only get 1 as it came from the doc.
2107  word->fontinfo_id_count = 1;
2108  word->italic = modal_font->is_italic() ? 1 : -1;
2109  word->bold = modal_font->is_bold() ? 1 : -1;
2110  }
2111  }
2112 }
int8_t italic
Definition: pageres.h:301
int count(LIST var_list)
Definition: oldlist.cpp:98
const FontInfo * fontinfo
Definition: pageres.h:304
Definition: statistc.h:33
bool is_bold() const
Definition: fontinfo.h:112
int8_t fontinfo_id2_count
Definition: pageres.h:307
bool is_italic() const
Definition: fontinfo.h:111
int8_t bold
Definition: pageres.h:302
int8_t fontinfo_id_count
Definition: pageres.h:306
int length() const
Definition: ratngs.h:303
int32_t universal_id
Definition: fontinfo.h:123
WERD_CHOICE * best_choice
Definition: pageres.h:235
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo2
Definition: pageres.h:305
WERD * word
Definition: pageres.h:189

◆ fp_eval_word_spacing()

int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 860 of file fixspace.cpp.

860  {
861  WERD_RES_IT word_it(&word_res_list);
862  WERD_RES *word;
863  int16_t score = 0;
864  int16_t i;
865  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
866 
867  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
868  word = word_it.data();
869  if (word->rebuild_word == nullptr)
870  continue; // Can't handle cube words.
871  if (word->done ||
872  word->tess_accepted ||
873  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
874  word->best_choice->permuter() == FREQ_DAWG_PERM ||
875  word->best_choice->permuter() == USER_DAWG_PERM ||
876  safe_dict_word(word) > 0) {
877  int num_blobs = word->rebuild_word->NumBlobs();
878  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
879  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
880  TBLOB* blob = word->rebuild_word->blobs[i];
881  if (word->best_choice->unichar_id(i) == space ||
882  blob_noise_score(blob) < small_limit) {
883  score -= 1; // penalise possibly erroneous non-space
884  } else if (word->reject_map[i].accepted()) {
885  score++;
886  }
887  }
888  }
889  }
890  if (score < 0)
891  score = 0;
892  return score;
893 }
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
REJMAP reject_map
Definition: pageres.h:287
uint8_t permuter() const
Definition: ratngs.h:346
int NumBlobs() const
Definition: blobs.h:432
const int kBlnXHeight
Definition: normalis.h:24
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:790
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
bool tess_accepted
Definition: pageres.h:296
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
bool done
Definition: pageres.h:298
const UNICHARSET * uch_set
Definition: pageres.h:206
Definition: blobs.h:268
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
BOOL8  ok_dict_word 
)

Definition at line 680 of file docqual.cpp.

680  {
681  enum STATES
682  {
683  JUNK,
684  FIRST_UPPER,
685  FIRST_LOWER,
686  FIRST_NUM,
687  SUBSEQUENT_UPPER,
688  SUBSEQUENT_LOWER,
689  SUBSEQUENT_NUM
690  };
691  const char *str = word->best_choice->unichar_string().string();
692  const char *lengths = word->best_choice->unichar_lengths().string();
693  STATES state = JUNK;
694  int len = 0;
695  int isolated_digits = 0;
696  int isolated_alphas = 0;
697  int bad_char_count = 0;
698  int tess_rejs = 0;
699  int dodgy_chars = 0;
700  int ok_chars;
701  UNICHAR_ID last_char = -1;
702  int alpha_repetition_count = 0;
703  int longest_alpha_repetition_count = 0;
704  int longest_lower_run_len = 0;
705  int lower_string_count = 0;
706  int longest_upper_run_len = 0;
707  int upper_string_count = 0;
708  int total_alpha_count = 0;
709  int total_digit_count = 0;
710 
711  for (; *str != '\0'; str += *(lengths++)) {
712  len++;
713  if (word->uch_set->get_isupper (str, *lengths)) {
714  total_alpha_count++;
715  switch (state) {
716  case SUBSEQUENT_UPPER:
717  case FIRST_UPPER:
718  state = SUBSEQUENT_UPPER;
719  upper_string_count++;
720  if (longest_upper_run_len < upper_string_count)
721  longest_upper_run_len = upper_string_count;
722  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
723  alpha_repetition_count++;
724  if (longest_alpha_repetition_count < alpha_repetition_count) {
725  longest_alpha_repetition_count = alpha_repetition_count;
726  }
727  }
728  else {
729  last_char = word->uch_set->unichar_to_id(str, *lengths);
730  alpha_repetition_count = 1;
731  }
732  break;
733  case FIRST_NUM:
734  isolated_digits++;
735  default:
736  state = FIRST_UPPER;
737  last_char = word->uch_set->unichar_to_id(str, *lengths);
738  alpha_repetition_count = 1;
739  upper_string_count = 1;
740  break;
741  }
742  }
743  else if (word->uch_set->get_islower (str, *lengths)) {
744  total_alpha_count++;
745  switch (state) {
746  case SUBSEQUENT_LOWER:
747  case FIRST_LOWER:
748  state = SUBSEQUENT_LOWER;
749  lower_string_count++;
750  if (longest_lower_run_len < lower_string_count)
751  longest_lower_run_len = lower_string_count;
752  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753  alpha_repetition_count++;
754  if (longest_alpha_repetition_count < alpha_repetition_count) {
755  longest_alpha_repetition_count = alpha_repetition_count;
756  }
757  }
758  else {
759  last_char = word->uch_set->unichar_to_id(str, *lengths);
760  alpha_repetition_count = 1;
761  }
762  break;
763  case FIRST_NUM:
764  isolated_digits++;
765  default:
766  state = FIRST_LOWER;
767  last_char = word->uch_set->unichar_to_id(str, *lengths);
768  alpha_repetition_count = 1;
769  lower_string_count = 1;
770  break;
771  }
772  }
773  else if (word->uch_set->get_isdigit (str, *lengths)) {
774  total_digit_count++;
775  switch (state) {
776  case FIRST_NUM:
777  state = SUBSEQUENT_NUM;
778  case SUBSEQUENT_NUM:
779  break;
780  case FIRST_UPPER:
781  case FIRST_LOWER:
782  isolated_alphas++;
783  default:
784  state = FIRST_NUM;
785  break;
786  }
787  }
788  else {
789  if (*lengths == 1 && *str == ' ')
790  tess_rejs++;
791  else
792  bad_char_count++;
793  switch (state) {
794  case FIRST_NUM:
795  isolated_digits++;
796  break;
797  case FIRST_UPPER:
798  case FIRST_LOWER:
799  isolated_alphas++;
800  default:
801  break;
802  }
803  state = JUNK;
804  }
805  }
806 
807  switch (state) {
808  case FIRST_NUM:
809  isolated_digits++;
810  break;
811  case FIRST_UPPER:
812  case FIRST_LOWER:
813  isolated_alphas++;
814  default:
815  break;
816  }
817 
818  if (crunch_include_numerals) {
819  total_alpha_count += total_digit_count - isolated_digits;
820  }
821 
822  if (crunch_leave_ok_strings && len >= 4 &&
823  2 * (total_alpha_count - isolated_alphas) > len &&
824  longest_alpha_repetition_count < crunch_long_repetitions) {
825  if ((crunch_accept_ok &&
826  acceptable_word_string(*word->uch_set, str, lengths) !=
827  AC_UNACCEPTABLE) ||
828  longest_lower_run_len > crunch_leave_lc_strings ||
829  longest_upper_run_len > crunch_leave_uc_strings)
830  return G_NEVER_CRUNCH;
831  }
832  if (word->reject_map.length() > 1 &&
833  strpbrk(str, " ") == nullptr &&
834  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
835  word->best_choice->permuter() == FREQ_DAWG_PERM ||
836  word->best_choice->permuter() == USER_DAWG_PERM ||
837  word->best_choice->permuter() == NUMBER_PERM ||
838  acceptable_word_string(*word->uch_set, str, lengths) !=
839  AC_UNACCEPTABLE || ok_dict_word))
840  return G_OK;
841 
842  ok_chars = len - bad_char_count - isolated_digits -
843  isolated_alphas - tess_rejs;
844 
845  if (crunch_debug > 3) {
846  tprintf("garbage_word: \"%s\"\n",
847  word->best_choice->unichar_string().string());
848  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
849  len,
850  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
851  }
852  if (bad_char_count == 0 &&
853  tess_rejs == 0 &&
854  (len > isolated_digits + isolated_alphas || len <= 2))
855  return G_OK;
856 
857  if (tess_rejs > ok_chars ||
858  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
859  return G_TERRIBLE;
860 
861  if (len > 4) {
862  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
863  isolated_alphas;
864  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
865  return G_DODGY;
866  else
867  return G_OK;
868  } else {
869  dodgy_chars = 2 * tess_rejs + bad_char_count;
870  if ((len == 4 && dodgy_chars > 2) ||
871  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
872  return G_DODGY;
873  else
874  return G_OK;
875  }
876 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
int32_t length() const
Definition: rejctmap.h:223
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
Definition: docqual.h:32
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
Unacceptable word.
Definition: control.h:30
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 258 of file output.cpp.

258  { // what char is repeated?
259  int i;
260  for (i = 0; ((i < word->reject_map.length()) &&
261  (word->reject_map[i].rejected())); ++i);
262 
263  if (i < word->reject_map.length()) {
264  return word->best_choice->unichar_id(i);
265  } else {
266  return word->uch_set->unichar_to_id(unrecognised_char.string());
267  }
268 }
REJMAP reject_map
Definition: pageres.h:287
int32_t length() const
Definition: rejctmap.h:223
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const UNICHARSET * uch_set
Definition: pageres.h:206
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ get_sub_lang()

Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 276 of file tesseractclass.h.

276  {
277  return sub_langs_[index];
278  }

◆ getDict()

Dict & tesseract::Tesseract::getDict ( )
overridevirtual

Reimplemented from tesseract::Classify.

Definition at line 556 of file tesseractclass.cpp.

557 {
558  if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang())
559  {
560  if (lstm_recognizer_ && lstm_recognizer_->GetDict())
561  {
562  return *const_cast<Dict*>(lstm_recognizer_->GetDict());
563  }
564  }
565  return Classify::getDict();
566  }
const Dict * GetDict() const
virtual Dict & getDict()
Definition: classify.h:107
bool AnyLSTMLang() const

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX &  line_box,
const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
int  start_box,
int  end_box,
const BLOCK &  block 
)

Definition at line 129 of file linerec.cpp.

133  {
134  TBOX revised_box;
135  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
136  &revised_box);
137  if (image_data == nullptr) return nullptr;
138  image_data->set_page_number(applybox_page);
139  // Copy the boxes and shift them so they are relative to the image.
140  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
141  ICOORD shift = -revised_box.botleft();
142  GenericVector<TBOX> line_boxes;
143  GenericVector<STRING> line_texts;
144  for (int b = start_box; b < end_box; ++b) {
145  TBOX box = boxes[b];
146  box.rotate(block_rotation);
147  box.move(shift);
148  line_boxes.push_back(box);
149  line_texts.push_back(texts[b]);
150  }
151  GenericVector<int> page_numbers;
152  page_numbers.init_to_size(line_boxes.size(), applybox_page);
153  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
154  return image_data;
155 }
int size() const
Definition: genericvector.h:71
void rotate(const FCOORD &vec)
Definition: rect.h:197
FCOORD re_rotation() const
Definition: ocrblock.h:136
Definition: rect.h:34
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:163
void move(const ICOORD vec)
Definition: rect.h:157
const int kImagePadding
Definition: imagedata.h:39
integer coordinate
Definition: points.h:32
void init_to_size(int size, const T &t)
int push_back(T object)
const ICOORD & botleft() const
Definition: rect.h:92
Definition: points.h:189
float x() const
Definition: points.h:208
float y() const
Definition: points.h:211

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX &  box,
const BLOCK &  block,
int  padding,
TBOX *  revised_box 
) const

Definition at line 163 of file linerec.cpp.

164  {
165  TBOX wbox = box;
166  wbox.pad(padding, padding);
167  *revised_box = wbox;
168  // Number of clockwise 90 degree rotations needed to get back to tesseract
169  // coords from the clipped image.
170  int num_rotations = 0;
171  if (block.re_rotation().y() > 0.0f)
172  num_rotations = 1;
173  else if (block.re_rotation().x() < 0.0f)
174  num_rotations = 2;
175  else if (block.re_rotation().y() < 0.0f)
176  num_rotations = 3;
177  // Handle two cases automatically: 1 the box came from the block, 2 the box
178  // came from a box file, and refers to the image, which the block may not.
179  if (block.pdblk.bounding_box().major_overlap(*revised_box))
180  revised_box->rotate(block.re_rotation());
181  // Now revised_box always refers to the image.
182  // BestPix is never colormapped, but may be of any depth.
183  Pix* pix = BestPix();
184  int width = pixGetWidth(pix);
185  int height = pixGetHeight(pix);
186  TBOX image_box(0, 0, width, height);
187  // Clip to image bounds.
188  *revised_box &= image_box;
189  if (revised_box->null_box()) return nullptr;
190  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
191  revised_box->width(), revised_box->height());
192  Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
193  if (box_pix == nullptr) return nullptr;
194  boxDestroy(&clip_box);
195  if (num_rotations > 0) {
196  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
197  pixDestroy(&box_pix);
198  box_pix = rot_pix;
199  }
200  // Convert sub-8-bit images to 8 bit.
201  int depth = pixGetDepth(box_pix);
202  if (depth < 8) {
203  Pix* grey;
204  grey = pixConvertTo8(box_pix, false);
205  pixDestroy(&box_pix);
206  box_pix = grey;
207  }
208  bool vertical_text = false;
209  if (num_rotations > 0) {
210  // Rotate the clipped revised box back to internal coordinates.
211  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
212  revised_box->rotate(rotation);
213  if (num_rotations != 2)
214  vertical_text = true;
215  }
216  return new ImageData(vertical_text, box_pix);
217 }
void rotate(const FCOORD &vec)
Definition: rect.h:197
FCOORD re_rotation() const
Definition: ocrblock.h:136
bool null_box() const
Definition: rect.h:50
Definition: rect.h:34
Pix * BestPix() const
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
Definition: points.h:189
float x() const
Definition: points.h:208
PDBLK pdblk
Definition: ocrblock.h:192
int16_t height() const
Definition: rect.h:108
void pad(int xpad, int ypad)
Definition: rect.h:131
float y() const
Definition: points.h:211

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES * word,
int *  num_rebuilt_leading,
ScriptPos * leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos * trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts or subscripts, so that SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though (for example, because no normally placed characters were found), there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging)
[out] leading_certainty: the worst certainty in the leading blobs.
[out] num_rebuilt_trailing: the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging)
[out] trailing_certainty: the worst certainty in the trailing blobs.
[out] avg_certainty: the average certainty of "normal" blobs in the word.
[out] unlikely_threshold: the threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 254 of file superscript.cpp.

262  {
263  *avg_certainty = *unlikely_threshold = 0.0f;
264  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
265  *leading_certainty = *trailing_certainty = 0.0f;
266 
267  int super_y_bottom =
269  int sub_y_top =
271 
272  // Step one: Get an average certainty for "normally placed" characters.
273 
274  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
275  *leading_pos = *trailing_pos = SP_NORMAL;
276  int leading_outliers = 0;
277  int trailing_outliers = 0;
278  int num_normal = 0;
279  float normal_certainty_total = 0.0f;
280  float worst_normal_certainty = 0.0f;
281  ScriptPos last_pos = SP_NORMAL;
282  int num_blobs = word->rebuild_word->NumBlobs();
283  for (int b = 0; b < num_blobs; ++b) {
284  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
285  ScriptPos pos = SP_NORMAL;
286  if (box.bottom() >= super_y_bottom) {
287  pos = SP_SUPERSCRIPT;
288  } else if (box.top() <= sub_y_top) {
289  pos = SP_SUBSCRIPT;
290  }
291  if (pos == SP_NORMAL) {
292  if (word->best_choice->unichar_id(b) != 0) {
293  float char_certainty = word->best_choice->certainty(b);
294  if (char_certainty < worst_normal_certainty) {
295  worst_normal_certainty = char_certainty;
296  }
297  num_normal++;
298  normal_certainty_total += char_certainty;
299  }
300  if (trailing_outliers == b) {
301  leading_outliers = trailing_outliers;
302  *leading_pos = last_pos;
303  }
304  trailing_outliers = 0;
305  } else {
306  if (last_pos == pos) {
307  trailing_outliers++;
308  } else {
309  trailing_outliers = 1;
310  }
311  }
312  last_pos = pos;
313  }
314  *trailing_pos = last_pos;
315  if (num_normal >= 3) { // throw out the worst as an outlier.
316  num_normal--;
317  normal_certainty_total -= worst_normal_certainty;
318  }
319  if (num_normal > 0) {
320  *avg_certainty = normal_certainty_total / num_normal;
321  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
322  }
323  if (num_normal == 0 ||
324  (leading_outliers == 0 && trailing_outliers == 0)) {
325  return;
326  }
327 
328  // Step two: Try to split off bits of the word that are both outliers
329  // and have much lower certainty than average
330  // Calculate num_leading and leading_certainty.
331  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
332  *num_rebuilt_leading < leading_outliers;
333  (*num_rebuilt_leading)++) {
334  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
335  if (char_certainty > *unlikely_threshold) {
336  break;
337  }
338  if (char_certainty < *leading_certainty) {
339  *leading_certainty = char_certainty;
340  }
341  }
342 
343  // Calculate num_trailing and trailing_certainty.
344  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
345  *num_rebuilt_trailing < trailing_outliers;
346  (*num_rebuilt_trailing)++) {
347  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
348  float char_certainty = word->best_choice->certainty(blob_idx);
349  if (char_certainty > *unlikely_threshold) {
350  break;
351  }
352  if (char_certainty < *trailing_certainty) {
353  *trailing_certainty = char_certainty;
354  }
355  }
356 }
TWERD * rebuild_word
Definition: pageres.h:260
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
const int kBlnXHeight
Definition: normalis.h:24
float certainty() const
Definition: ratngs.h:330
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t top() const
Definition: rect.h:58
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
int16_t bottom() const
Definition: rect.h:65
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 250 of file tesseractclass.h.

250  {
251  return pixGetHeight(pix_binary_);
252  }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 247 of file tesseractclass.h.

247  {
248  return pixGetWidth(pix_binary_);
249  }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 35 of file recogtraining.cpp.

35  {
37  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
38  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
39  // Explore all segmentations.
41  }
42 
43  STRING output_fname = fname;
44  const char *lastdot = strrchr(output_fname.string(), '.');
45  if (lastdot != nullptr) output_fname[lastdot - output_fname.string()] = '\0';
46  output_fname += ".txt";
47  FILE *output_file = fopen(output_fname.string(), "a+");
48  if (output_file == nullptr) {
49  tprintf("Error: Could not open file %s\n", output_fname.string());
50  ASSERT_HOST(output_file);
51  }
52  return output_file;
53 }
Dict & getDict() override
const char * string() const
Definition: strngs.cpp:196
bool stopper_no_acceptable_choices
Definition: dict.h:625
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager * mgr 
)

Definition at line 296 of file tessedit.cpp.

302  {
303  GenericVector<STRING> langs_to_load;
304  GenericVector<STRING> langs_not_to_load;
305  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
306 
307  sub_langs_.delete_data_pointers();
308  sub_langs_.clear();
309  // Find the first loadable lang and load into this.
310  // Add any languages that this language requires
311  bool loaded_primary = false;
312  // Load the rest into sub_langs_.
313  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
314  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
315  const char *lang_str = langs_to_load[lang_index].string();
316  Tesseract *tess_to_init;
317  if (!loaded_primary) {
318  tess_to_init = this;
319  } else {
320  tess_to_init = new Tesseract;
321  }
322 
323  int result = tess_to_init->init_tesseract_internal(
324  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
325  vars_values, set_only_non_debug_params, mgr);
326  // Forget that language, but keep any reader we were given.
327  mgr->Clear();
328 
329  if (!loaded_primary) {
330  if (result < 0) {
331  tprintf("Failed loading language '%s'\n", lang_str);
332  } else {
333  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
334  &langs_to_load, &langs_not_to_load);
335  loaded_primary = true;
336  }
337  } else {
338  if (result < 0) {
339  tprintf("Failed loading language '%s'\n", lang_str);
340  delete tess_to_init;
341  } else {
342  sub_langs_.push_back(tess_to_init);
343  // Add any languages that this language requires
344  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
345  &langs_to_load, &langs_not_to_load);
346  }
347  }
348  }
349  }
350  if (!loaded_primary) {
351  tprintf("Tesseract couldn't load any languages!\n");
352  return -1; // Couldn't load any language!
353  }
354 #ifndef DISABLED_LEGACY_ENGINE
355  if (!sub_langs_.empty()) {
356  // In multilingual mode word ratings have to be directly comparable,
357  // so use the same language model weights for all languages:
358  // use the primary language's params model if
359  // tessedit_use_primary_params_model is set,
360  // otherwise use default language model weights.
362  for (int s = 0; s < sub_langs_.size(); ++s) {
363  sub_langs_[s]->language_model_->getParamsModel().Copy(
364  this->language_model_->getParamsModel());
365  }
366  tprintf("Using params model of the primary language\n");
367  } else {
368  this->language_model_->getParamsModel().Clear();
369  for (int s = 0; s < sub_langs_.size(); ++s) {
370  sub_langs_[s]->language_model_->getParamsModel().Clear();
371  }
372  }
373  }
374 
376 #endif // ndef DISABLED_LEGACY_ENGINE
377  return 0;
378 }
int size() const
Definition: genericvector.h:71
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void SetupUniversalFontIds()
Definition: tessedit.cpp:441
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:262
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 524 of file tesseractclass.h.

526  {
527  TessdataManager mgr;
528  return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr, nullptr,
529  false, &mgr);
530  }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:296

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager * mgr 
)

Definition at line 396 of file tessedit.cpp.

402  {
403  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
404  configs_size, vars_vec, vars_values,
405  set_only_non_debug_params, mgr)) {
406  return -1;
407  }
409  return 0;
410  }
411  // If only LSTM will be used, skip loading Tesseract classifier's
412  // pre-trained templates and dictionary.
414  program_editup(textbase, init_tesseract ? mgr : nullptr,
415  init_tesseract ? mgr : nullptr);
416  return 0; //Normal exit
417 }
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:296
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager mgr 
)

Definition at line 91 of file tessedit.cpp.

96  {
97  // Set the basename, compute the data directory.
98  main_setup(arg0, textbase);
99 
100  // Set the language data path prefix
101  lang = language != nullptr ? language : "eng";
105 
106  // Initialize TessdataManager.
107  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
108  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
109  tprintf("Error opening data file %s\n", tessdata_path.string());
110  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set"
111  " to your \"tessdata\" directory.\n");
112  return false;
113  }
114 #ifndef DISABLED_LEGACY_ENGINE
115  if (oem == OEM_DEFAULT) {
116  // Set the engine mode from availability, which can then be overridden by
117  // the config file when we read it below.
118  if (!mgr->IsLSTMAvailable()) {
120  } else if (!mgr->IsBaseAvailable()) {
122  } else {
124  }
125  }
126 #endif // ndef DISABLED_LEGACY_ENGINE
127 
128  // If a language specific config file (lang.config) exists, load it in.
129  TFile fp;
130  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
132  this->params());
133  }
134 
135  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
137  // Load tesseract variables from config files. This is done after loading
138  // language-specific variables from [lang].traineddata file, so that custom
139  // config files can override values in [lang].traineddata file.
140  for (int i = 0; i < configs_size; ++i) {
141  read_config_file(configs[i], set_params_constraint);
142  }
143 
144  // Set params specified in vars_vec (done after setting params from config
145  // files, so that params in vars_vec can override those from files).
146  if (vars_vec != nullptr && vars_values != nullptr) {
147  for (int i = 0; i < vars_vec->size(); ++i) {
148  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
149  (*vars_values)[i].string(),
150  set_params_constraint, this->params())) {
151  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
152  exit(1);
153  }
154  }
155  }
156 
157  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
158  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
159  if (params_file != nullptr) {
160  ParamUtils::PrintParams(params_file, this->params());
161  fclose(params_file);
162  } else {
163  tprintf("Failed to open %s for writing params.\n",
165  }
166  }
167 
168  // Determine which ocr engine(s) should be loaded and used for recognition.
169  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
170 
171  // If we are only loading the config file (and so not planning on doing any
172  // recognition) then there's nothing else to do here.
174  return true;
175  }
176 
177 // The various OcrEngineMode settings (see publictypes.h) determine which
178 // engine-specific data files need to be loaded.
179 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
180 #ifndef ANDROID_BUILD
181 #ifdef DISABLED_LEGACY_ENGINE
183 #else
186 #endif // ndef DISABLED_LEGACY_ENGINE
187  if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
188  lstm_recognizer_ = new LSTMRecognizer;
189  ASSERT_HOST(
190  lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr));
191  } else {
192  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
194  }
195  }
196 #endif // ndef ANDROID_BUILD
197 
198  // Load the unicharset
200  // Avoid requiring a unicharset when we aren't running base tesseract.
201 #ifndef ANDROID_BUILD
202  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
203 #endif // ndef ANDROID_BUILD
204  }
205 #ifndef DISABLED_LEGACY_ENGINE
206  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
207  !unicharset.load_from_file(&fp, false)) {
208  return false;
209  }
210 #endif // ndef DISABLED_LEGACY_ENGINE
211  if (unicharset.size() > MAX_NUM_CLASSES) {
212  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
213  return false;
214  }
215  right_to_left_ = unicharset.major_right_to_left();
216 
217  // Setup initial unichar ambigs table and read universal ambigs.
218  UNICHARSET encoder_unicharset;
219  encoder_unicharset.CopyFrom(unicharset);
221  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
222 
223  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
224  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
227  }
228 #ifndef DISABLED_LEGACY_ENGINE
229  // Init ParamsModel.
230  // Load pass1 and pass2 weights (for now these two sets are the same, but in
231  // the future separate sets of weights can be generated).
232  for (int p = ParamsModel::PTRAIN_PASS1;
234  language_model_->getParamsModel().SetPass(
235  static_cast<ParamsModel::PassEnum>(p));
236  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
237  if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
238  return false;
239  }
240  }
241  }
242 #endif // ndef DISABLED_LEGACY_ENGINE
243 
244  return true;
245 }
const UNICHARSET & GetUnicharset() const
int size() const
Definition: genericvector.h:71
char * tessedit_write_params_to_file
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:171
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:447
const char * string() const
Definition: strngs.cpp:196
bool use_ambigs_for_adaption
Definition: ccutil.h:88
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:89
STRING language_data_path_prefix
Definition: ccutil.h:67
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
int size() const
Definition: unicharset.h:336
bool major_right_to_left() const
Definition: unicharset.cpp:962
int ambigs_debug_level
Definition: ccutil.h:84
SetParamConstraint
Definition: params.h:36
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
UNICHARSET unicharset
Definition: ccutil.h:68
STRING lang
Definition: ccutil.h:66
bool Load(const char *lang, TessdataManager *mgr)
ParamsVectors * params()
Definition: ccutil.h:62
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:60
Definition: strngs.h:45
STRING datadir
Definition: ccutil.h:64
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:49
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:48
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ init_tesseract_lm()

int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language,
TessdataManager * mgr 
)

Definition at line 462 of file tessedit.cpp.

463  {
464  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
465  nullptr, 0, nullptr, nullptr, false, mgr))
466  return -1;
468  getDict().Load(lang, mgr);
469  getDict().FinishLoad();
470  return 0;
471 }
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
Dict & getDict() override
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
STRING lang
Definition: ccutil.h:66
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
bool FinishLoad()
Definition: dict.cpp:323
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES * word,
WERD_RES * word2,
BlamerBundle * orig_bb 
) const

Definition at line 234 of file tfacepp.cpp.

236  {
237  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
238  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
239  // Tack the word2 outputs onto the end of the word outputs.
240  word->chopped_word->blobs += word2->chopped_word->blobs;
241  word->rebuild_word->blobs += word2->rebuild_word->blobs;
242  word2->chopped_word->blobs.clear();
243  word2->rebuild_word->blobs.clear();
244  TPOINT split_pt;
245  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
246  split_pt.y = (prev_box.top() + prev_box.bottom() +
247  blob_box.top() + blob_box.bottom()) / 4;
248  // Move the word2 seams onto the end of the word1 seam_array.
249  // Since the seam list is one element short, an empty seam marking the
250  // end of the last blob in the first word is needed first.
251  word->seam_array.push_back(new SEAM(0.0f, split_pt));
252  word->seam_array += word2->seam_array;
253  word2->seam_array.truncate(0);
254  // Fix widths and gaps.
255  word->blob_widths += word2->blob_widths;
256  word->blob_gaps += word2->blob_gaps;
257  // Fix the ratings matrix.
258  int rat1 = word->ratings->dimension();
259  int rat2 = word2->ratings->dimension();
260  word->ratings->AttachOnCorner(word2->ratings);
261  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
262  word->best_state += word2->best_state;
263  // Append the word choices.
264  *word->raw_choice += *word2->raw_choice;
265 
266  // How many alt choices from each should we try to get?
267  const int kAltsPerPiece = 2;
268  // When do we start throwing away extra alt choices?
269  const int kTooManyAltChoices = 100;
270 
271  // Construct the cartesian product of the best_choices of word(1) and word2.
272  WERD_CHOICE_LIST joined_choices;
273  WERD_CHOICE_IT jc_it(&joined_choices);
274  WERD_CHOICE_IT bc1_it(&word->best_choices);
275  WERD_CHOICE_IT bc2_it(&word2->best_choices);
276  int num_word1_choices = word->best_choices.length();
277  int total_joined_choices = num_word1_choices;
278  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
279  // word2 choices, and put them in the joined_choices list. The 1st word2
280  // choice gets added to the original word1 choices in-place after we have
281  // finished with them.
282  int bc2_index = 1;
283  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
284  if (total_joined_choices >= kTooManyAltChoices &&
285  bc2_index > kAltsPerPiece)
286  break;
287  int bc1_index = 0;
288  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
289  ++bc1_index, bc1_it.forward()) {
290  if (total_joined_choices >= kTooManyAltChoices &&
291  bc1_index > kAltsPerPiece)
292  break;
293  WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
294  *wc += *bc2_it.data();
295  jc_it.add_after_then_move(wc);
296  ++total_joined_choices;
297  }
298  }
299  // Now that we've filled in as many alternates as we want, paste the best
300  // choice for word2 onto the original word alt_choices.
301  bc1_it.move_to_first();
302  bc2_it.move_to_first();
303  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
304  *bc1_it.data() += *bc2_it.data();
305  }
306  bc1_it.move_to_last();
307  bc1_it.add_list_after(&joined_choices);
308 
309  // Restore the pointer to original blamer bundle and combine blamer
310  // information recorded in the splits.
311  if (orig_bb != nullptr) {
312  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
314  delete word->blamer_bundle;
315  word->blamer_bundle = orig_bb;
316  }
317  word->SetupBoxWord();
318  word->reject_map.initialise(word->box_word->length());
319  delete word2;
320 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
TWERD * rebuild_word
Definition: pageres.h:260
GenericVector< int > blob_widths
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:287
Definition: seam.h:44
Definition: rect.h:34
T & back() const
void SetupBoxWord()
Definition: pageres.cpp:855
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:230
GenericVector< int > best_state
Definition: pageres.h:271
int dimension() const
Definition: matrix.h:533
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
GenericVector< int > blob_gaps
Definition: pageres.h:222
int push_back(T object)
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
int16_t x
Definition: blobs.h:78
MATRIX * ratings
Definition: pageres.h:231
int length() const
Definition: boxword.h:83
BlamerBundle * blamer_bundle
Definition: pageres.h:246
int16_t right() const
Definition: rect.h:79
void truncate(int size)
WERD_CHOICE * raw_choice
Definition: pageres.h:240
bool wordrec_debug_blamer
Definition: wordrec.h:236
TWERD * chopped_word
Definition: pageres.h:215
Definition: blobs.h:57
int16_t y
Definition: blobs.h:79
int16_t bottom() const
Definition: rect.h:65
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:550
tesseract::BoxWord * box_word
Definition: pageres.h:266
void initialise(int16_t length)
Definition: rejctmap.cpp:275
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK & block,
ROW * row,
WERD_RES * word,
PointerVector< WERD_RES > *  words 
)

Definition at line 222 of file linerec.cpp.

223  {
224  TBOX word_box = word->word->bounding_box();
225  // Get the word image - no frills.
228  // In single word mode, use the whole image without any other row/word
229  // interpretation.
230  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
231  } else {
232  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
233  if (baseline + row->descenders() < word_box.bottom())
234  word_box.set_bottom(baseline + row->descenders());
235  if (baseline + row->x_height() + row->ascenders() > word_box.top())
236  word_box.set_top(baseline + row->x_height() + row->ascenders());
237  }
238  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
239  if (im_data == nullptr) return;
240  lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
242  word_box, words, lstm_choice_mode);
243  delete im_data;
244  SearchWords(words);
245 }
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
TBOX bounding_box() const
Definition: werd.cpp:159
float base_line(float xpos) const
Definition: ocrrow.h:59
Definition: rect.h:34
const float kCertaintyScale
Definition: linerec.cpp:36
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:163
const int kImagePadding
Definition: imagedata.h:39
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:250
Treat the image as a single word.
Definition: publictypes.h:174
int16_t left() const
Definition: rect.h:72
float ascenders() const
Definition: ocrrow.h:82
int16_t top() const
Definition: rect.h:58
float x_height() const
Definition: ocrrow.h:64
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
float descenders() const
Definition: ocrrow.h:85
const float kWorstDictCertainty
Definition: linerec.cpp:38
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
WERD * word
Definition: pageres.h:189

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES * word,
ROW * row,
int16_t  pass 
)

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW row,
BLOCK block 
)

Definition at line 226 of file fixspace.cpp.

227  {
228  WERD_RES_IT word_it(&words);
229  WERD_RES *word;
230  // Since we are not using PAGE_RES to iterate over words, we need to update
231  // prev_word_best_choice_ before calling classify_word_pass2().
232  prev_word_best_choice_ = nullptr;
233  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
234  word = word_it.data();
235  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
236  WordData word_data(block, row, word);
237  SetupWordPassN(2, &word_data);
238  classify_word_and_language(2, nullptr, &word_data);
239  }
241  }
242 }
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
bool part_of_combo
Definition: pageres.h:335
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::BoxWord * box_word
Definition: pageres.h:266
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES word,
ROW row,
BLOCK block 
)

match_word_pass_n()

Baseline normalize the word and pass it to Tess.

Definition at line 1649 of file control.cpp.

1650  {
1651  if (word->tess_failed) return;
1652  tess_segment_pass_n(pass_n, word);
1653 
1654  if (!word->tess_failed) {
1655  if (!word->word->flag (W_REP_CHAR)) {
1656  word->fix_quotes();
1658  word->fix_hyphens();
1659  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1660  if (word->best_choice->length() != word->box_word->length()) {
1661  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1662  " #Blobs=%d\n",
1663  word->best_choice->debug_string().string(),
1664  word->best_choice->length(),
1665  word->box_word->length());
1666 
1667  }
1668  word->tess_accepted = tess_acceptable_word(word);
1669 
1670  // Also sets word->done flag
1671  make_reject_map(word, row, pass_n);
1672  }
1673  }
1674  set_word_fonts(word);
1675 
1676  ASSERT_HOST(word->raw_choice != nullptr);
1677 }
bool tess_failed
Definition: pageres.h:288
const char * string() const
Definition: strngs.cpp:196
void fix_hyphens()
Definition: pageres.cpp:1053
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1981
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void fix_quotes()
Definition: pageres.cpp:1024
bool tess_accepted
Definition: pageres.h:296
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
const STRING debug_string() const
Definition: ratngs.h:505
int length() const
Definition: boxword.h:83
WERD_CHOICE * raw_choice
Definition: pageres.h:240
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::BoxWord * box_word
Definition: pageres.h:266
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK block,
ROW row,
WERD_RES word_res 
)

◆ mutable_pix_binary()

Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 197 of file tesseractclass.h.

197  {
198  pixDestroy(&pix_binary_);
199  return &pix_binary_;
200  }

◆ mutable_textord()

Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 266 of file tesseractclass.h.

266  {
267  return &textord_;
268  }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)

◆ noise_outlines()

bool tesseract::Tesseract::noise_outlines ( TWERD word)

Definition at line 978 of file docqual.cpp.

978  {
979  TBOX box; // BB of outline
980  int16_t outline_count = 0;
981  int16_t small_outline_count = 0;
982  int16_t max_dimension;
983  float small_limit = kBlnXHeight * crunch_small_outlines_size;
984 
985  for (int b = 0; b < word->NumBlobs(); ++b) {
986  TBLOB* blob = word->blobs[b];
987  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
988  outline_count++;
989  box = ol->bounding_box();
990  if (box.height() > box.width())
991  max_dimension = box.height();
992  else
993  max_dimension = box.width();
994  if (max_dimension < small_limit)
995  small_outline_count++;
996  }
997  }
998  return small_outline_count >= outline_count;
999 }
TESSLINE * next
Definition: blobs.h:265
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
const int kBlnXHeight
Definition: normalis.h:24
int16_t width() const
Definition: rect.h:115
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
Definition: blobs.h:268
TESSLINE * outlines
Definition: blobs.h:384
int16_t height() const
Definition: rect.h:108

◆ non_0_digit()

bool tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 794 of file reject.cpp.

794  {
795  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
796 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507

◆ non_O_upper()

bool tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 790 of file reject.cpp.

790  {
791  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
792 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 273 of file tesseractclass.h.

273  {
274  return sub_langs_.size();
275  }

◆ one_ell_conflict()

bool tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
bool  update_map 
)

Definition at line 297 of file reject.cpp.

297  {
298  const char *word;
299  const char *lengths;
300  int16_t word_len; //its length
301  int16_t first_alphanum_index_;
302  int16_t first_alphanum_offset_;
303  int16_t i;
304  int16_t offset;
305  bool non_conflict_set_char; //non conf set a/n?
306  bool conflict = false;
307  bool allow_1s;
308  ACCEPTABLE_WERD_TYPE word_type;
309  bool dict_perm_type;
310  bool dict_word_ok;
311  int dict_word_type;
312 
313  word = word_res->best_choice->unichar_string().string ();
314  lengths = word_res->best_choice->unichar_lengths().string();
315  word_len = strlen(lengths);
316  /*
317  If there are no occurrences of the conflict set characters then the word
318  is OK.
319  */
320  if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
321  return false;
322 
323  /*
324  There is a conflict if there are NO other (confirmed) alphanumerics apart
325  from those in the conflict set.
326  */
327 
328  for (i = 0, offset = 0, non_conflict_set_char = false;
329  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
330  non_conflict_set_char =
331  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
332  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
333  !STRING (conflict_set_I_l_1).contains (word[offset]);
334  if (!non_conflict_set_char) {
335  if (update_map)
336  reject_I_1_L(word_res);
337  return true;
338  }
339 
340  /*
341  If the word is accepted by a dawg permuter, and the first alpha character
342  is "I" or "l", check to see if the alternative is also a dawg word. If it
343  is, then there is a potential error otherwise the word is ok.
344  */
345 
346  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
347  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
349  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
350  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
351  dict_word_type = dict_word(*(word_res->best_choice));
352  dict_word_ok = (dict_word_type > 0) &&
353  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
354 
355  if ((rej_1Il_use_dict_word && dict_word_ok) ||
356  (rej_1Il_trust_permuter_type && dict_perm_type) ||
357  (dict_perm_type && dict_word_ok)) {
358  first_alphanum_index_ = first_alphanum_index (word, lengths);
359  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
360  if (lengths[first_alphanum_index_] == 1 &&
361  word[first_alphanum_offset_] == 'I') {
362  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
363  if (safe_dict_word(word_res) > 0) {
364  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365  if (update_map)
366  word_res->reject_map[first_alphanum_index_].
367  setrej_1Il_conflict();
368  return true;
369  }
370  else {
371  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
372  return false;
373  }
374  }
375 
376  if (lengths[first_alphanum_index_] == 1 &&
377  word[first_alphanum_offset_] == 'l') {
378  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
379  if (safe_dict_word(word_res) > 0) {
380  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
381  if (update_map)
382  word_res->reject_map[first_alphanum_index_].
383  setrej_1Il_conflict();
384  return true;
385  }
386  else {
387  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
388  return false;
389  }
390  }
391  return false;
392  }
393 
394  /*
395  NEW 1Il code. The old code relied on permuter types too much. In fact,
396  tess will use TOP_CHOICE permute for good things like "palette".
397  In this code the string is examined independently to see if it looks like
398  a well formed word.
399  */
400 
401  /*
402  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
403  dictionary word.
404  */
405  first_alphanum_index_ = first_alphanum_index (word, lengths);
406  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
407  if (lengths[first_alphanum_index_] == 1 &&
408  word[first_alphanum_offset_] == 'l') {
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
410  if (safe_dict_word(word_res) > 0)
411  return false;
412  else
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  }
415  else if (lengths[first_alphanum_index_] == 1 &&
416  word[first_alphanum_offset_] == 'I') {
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
418  if (safe_dict_word(word_res) > 0)
419  return false;
420  else
421  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
422  }
423  /*
424  For strings containing digits:
425  If there are no alphas OR the numeric permuter liked the word,
426  reject any non 1 conflict chs
427  Else reject all conflict chs
428  */
429  if (word_contains_non_1_digit (word, lengths)) {
430  allow_1s = (alpha_count (word, lengths) == 0) ||
431  (word_res->best_choice->permuter () == NUMBER_PERM);
432 
433  int16_t offset;
434  conflict = false;
435  for (i = 0, offset = 0; word[offset] != '\0';
436  offset += word_res->best_choice->unichar_lengths()[i++]) {
437  if ((!allow_1s || (word[offset] != '1')) &&
438  STRING (conflict_set_I_l_1).contains (word[offset])) {
439  if (update_map)
440  word_res->reject_map[i].setrej_1Il_conflict ();
441  conflict = true;
442  }
443  }
444  return conflict;
445  }
446  /*
447  For anything else. See if it conforms to an acceptable word type. If so,
448  treat accordingly.
449  */
450  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
451  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
452  first_alphanum_index_ = first_alphanum_index (word, lengths);
453  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
454  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
455  if (update_map)
456  word_res->reject_map[first_alphanum_index_].
457  setrej_1Il_conflict ();
458  return true;
459  }
460  else
461  return false;
462  }
463  else if (word_type == AC_UPPER_CASE) {
464  return false;
465  }
466  else {
467  if (update_map)
468  reject_I_1_L(word_res);
469  return true;
470  }
471 }
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:500
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:474
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:514
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:487
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
ALL but initial lc.
Definition: control.h:33
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
const UNICHARSET * uch_set
Definition: pageres.h:206
ALL upper case.
Definition: control.h:32
const STRING & unichar_string() const
Definition: ratngs.h:541
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
WERD_CHOICE * best_choice
Definition: pageres.h:235
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:198
ALL lower case.
Definition: control.h:31

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT &  page_res_it,
const TBOX *  target_word_box 
)

Definition at line 43 of file output.cpp.

45  {
46  BLOCK_RES *block_of_last_word;
47  bool force_eol; //During output
48  BLOCK *nextblock; //block of next word
49  WERD *nextword; //next word
50 
51  page_res_it.restart_page ();
52  block_of_last_word = nullptr;
53  while (page_res_it.word () != nullptr) {
54  check_debug_pt (page_res_it.word (), 120);
55 
56  if (target_word_box) {
57  TBOX current_word_box = page_res_it.word()->word->bounding_box();
58  FCOORD center_pt(
59  (current_word_box.right() + current_word_box.left()) / 2,
60  (current_word_box.bottom() + current_word_box.top()) / 2);
61  if (!target_word_box->contains(center_pt)) {
62  page_res_it.forward();
63  continue;
64  }
65  }
67  block_of_last_word != page_res_it.block ()) {
68  block_of_last_word = page_res_it.block ();
69  }
70 
71  force_eol = (tessedit_write_block_separators &&
72  (page_res_it.block () != page_res_it.next_block ())) ||
73  (page_res_it.next_word () == nullptr);
74 
75  if (page_res_it.next_word () != nullptr)
76  nextword = page_res_it.next_word ()->word;
77  else
78  nextword = nullptr;
79  if (page_res_it.next_block () != nullptr)
80  nextblock = page_res_it.next_block ()->block;
81  else
82  nextblock = nullptr;
83  //regardless of tilde crunching
84  write_results(page_res_it,
85  determine_newline_type(page_res_it.word()->word,
86  page_res_it.block()->block,
87  nextword, nextblock), force_eol);
88  page_res_it.forward();
89  }
90 }
BLOCK_RES * block() const
Definition: pageres.h:757
TBOX bounding_box() const
Definition: werd.cpp:159
Definition: rect.h:34
BLOCK_RES * next_block() const
Definition: pageres.h:766
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:105
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:220
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
WERD_RES * next_word() const
Definition: pageres.h:760
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
Definition: ocrblock.h:30
bool contains(const FCOORD pt) const
Definition: rect.h:333
Definition: points.h:189
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
WERD_RES * forward()
Definition: pageres.h:731
int16_t bottom() const
Definition: rect.h:65
WERD * word
Definition: pageres.h:189

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 262 of file tessedit.cpp.

264  {
265  STRING remains(lang_str);
266  while (remains.length() > 0) {
267  // Find the start of the lang code and which vector to add to.
268  const char* start = remains.string();
269  while (*start == '+')
270  ++start;
271  GenericVector<STRING>* target = to_load;
272  if (*start == '~') {
273  target = not_to_load;
274  ++start;
275  }
276  // Find the index of the end of the lang code in string start.
277  int end = strlen(start);
278  const char* plus = strchr(start, '+');
279  if (plus != nullptr && plus - start < end)
280  end = plus - start;
281  STRING lang_code(start);
282  lang_code.truncate_at(end);
283  STRING next(start + end);
284  remains = next;
285  // Check whether lang_code is already in the target vector and add.
286  if (!IsStrInList(lang_code, *target)) {
287  target->push_back(lang_code);
288  }
289  }
290 }
int push_back(T object)
Definition: strngs.h:45

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 327 of file pgedit.cpp.

327  {
328  current_page_res = page_res;
329  if (current_page_res->block_res_list.empty())
330  return;
331 
332  recog_done = false;
333  stillRunning = true;
334 
335  build_image_window(width, height);
338 #ifndef GRAPHICS_DISABLED
339  pe = new ParamsEditor(this, image_win);
340 #endif
341  PGEventHandler pgEventHandler(this);
342 
343  image_win->AddEventHandler(&pgEventHandler);
344  image_win->AddMessageBox();
345 
346  SVMenuNode* svMenuRoot = build_menu_new();
347 
348  svMenuRoot->BuildMenu(image_win);
349  image_win->SetVisible(true);
350 
351  image_win->AwaitEvent(SVET_DESTROY);
352  image_win->AddEventHandler(nullptr);
353 }
void AddMessageBox()
Definition: scrollview.cpp:580
void SetVisible(bool visible)
Definition: scrollview.cpp:551
PAGE_RES * current_page_res
Definition: pgedit.cpp:121
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:298
void build_image_window(int width, int height)
Definition: pgedit.cpp:186
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:38
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:247
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:416
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:944
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:120
BITS16 word_display_mode
Definition: pgedit.cpp:115

◆ pix_binary()

Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 201 of file tesseractclass.h.

201  {
202  return pix_binary_;
203  }

◆ pix_grey()

Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 204 of file tesseractclass.h.

204  {
205  return pix_grey_;
206  }

◆ pix_original()

Pix* tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 211 of file tesseractclass.h.

211 { return pix_original_; }

◆ potential_word_crunch()

bool tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 542 of file docqual.cpp.

544  {
545  float rating_per_ch;
546  int adjusted_len;
547  const char *str = word->best_choice->unichar_string().string();
548  const char *lengths = word->best_choice->unichar_lengths().string();
549  bool word_crunchable;
550  int poor_indicator_count = 0;
551 
552  word_crunchable = !crunch_leave_accept_strings ||
553  word->reject_map.length() < 3 ||
555  str, lengths) == AC_UNACCEPTABLE &&
556  !ok_dict_word);
557 
558  adjusted_len = word->reject_map.length();
559  if (adjusted_len > 10)
560  adjusted_len = 10;
561  rating_per_ch = word->best_choice->rating() / adjusted_len;
562 
563  if (rating_per_ch > crunch_pot_poor_rate) {
564  if (crunch_debug > 2) {
565  tprintf("Potential poor rating on \"%s\"\n",
566  word->best_choice->unichar_string().string());
567  }
568  poor_indicator_count++;
569  }
570 
571  if (word_crunchable &&
573  if (crunch_debug > 2) {
574  tprintf("Potential poor cert on \"%s\"\n",
575  word->best_choice->unichar_string().string());
576  }
577  poor_indicator_count++;
578  }
579 
580  if (garbage_level != G_OK) {
581  if (crunch_debug > 2) {
582  tprintf("Potential garbage on \"%s\"\n",
583  word->best_choice->unichar_string().string());
584  }
585  poor_indicator_count++;
586  }
587  return poor_indicator_count >= crunch_pot_indicators;
588 }
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t length() const
Definition: rejctmap.h:223
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
const STRING & unichar_lengths() const
Definition: ratngs.h:548
Definition: docqual.h:32
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
Unacceptable word.
Definition: control.h:30
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 624 of file tesseractclass.cpp.

624  {
626  // Find the max splitter strategy over all langs.
627  ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
629  static_cast<int32_t>(pageseg_devanagari_split_strategy));
630  for (int i = 0; i < sub_langs_.size(); ++i) {
631  ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
633  static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
634  if (pageseg_strategy > max_pageseg_strategy)
635  max_pageseg_strategy = pageseg_strategy;
636  pixDestroy(&sub_langs_[i]->pix_binary_);
637  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
638  }
639  // Perform shiro-rekha (top-line) splitting and replace the current image by
640  // the newly split image.
641  splitter_.set_orig_pix(pix_binary());
642  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
643  if (splitter_.Split(true, &pixa_debug_)) {
644  ASSERT_HOST(splitter_.splitted_image());
645  pixDestroy(&pix_binary_);
646  pix_binary_ = pixClone(splitter_.splitted_image());
647  }
648 }
Pix * pix_binary() const
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 655 of file tesseractclass.cpp.

656  {
657  // Find the max splitter strategy over all langs.
658  ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
660  static_cast<int32_t>(ocr_devanagari_split_strategy));
661  for (int i = 0; i < sub_langs_.size(); ++i) {
662  ShiroRekhaSplitter::SplitStrategy ocr_strategy =
664  static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
665  if (ocr_strategy > max_ocr_strategy)
666  max_ocr_strategy = ocr_strategy;
667  }
668  // Utilize the segmentation information available.
669  splitter_.set_segmentation_block_list(block_list);
670  splitter_.set_ocr_split_strategy(max_ocr_strategy);
671  // Run the splitter for OCR
672  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
673  // Restore pix_binary to the binarized original pix for future reference.
674  ASSERT_HOST(splitter_.orig_pix());
675  pixDestroy(&pix_binary_);
676  pix_binary_ = pixClone(splitter_.orig_pix());
677  // If the pageseg and ocr strategies are different, refresh the block list
678  // (from the last SegmentImage call) with blobs from the real image to be used
679  // for OCR.
680  if (splitter_.HasDifferentSplitStrategies()) {
681  BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
682  pixGetHeight(pix_binary_));
683  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
684  splitter_.orig_pix();
685  extract_edges(pix_for_ocr, &block);
686  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
687  }
688  // The splitter isn't needed any more after this, so save memory by clearing.
689  splitter_.Clear();
690 }
#define TRUE
Definition: capi.h:51
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
Definition: ocrblock.h:30
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_ocr_split_strategy(SplitStrategy strategy)
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
void set_segmentation_block_list(BLOCK_LIST *block_list)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 39 of file par_control.cpp.

39  {
40  // Prepare all the blobs.
42  for (int w = 0; w < words.size(); ++w) {
43  if (words[w].word->ratings != nullptr &&
44  words[w].word->ratings->get(0, 0) == nullptr) {
45  for (int s = 0; s < words[w].lang_words.size(); ++s) {
46  Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
47  const WERD_RES& word = *words[w].lang_words[s];
48  for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
49  blobs.push_back(BlobData(b, sub, word));
50  }
51  }
52  }
53  }
54  // Pre-classify all the blobs.
55  if (tessedit_parallelize > 1) {
56 #ifdef _OPENMP
57 #pragma omp parallel for num_threads(10)
58 #endif // _OPENMP
59  for (int b = 0; b < blobs.size(); ++b) {
60  *blobs[b].choices =
61  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
62  }
63  } else {
64  // TODO(AMD) parallelize this.
65  for (int b = 0; b < blobs.size(); ++b) {
66  *blobs[b].choices =
67  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
68  }
69  }
70 }
int size() const
Definition: genericvector.h:71
Definition: callcpp.h:31
int NumBlobs() const
Definition: blobs.h:432
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:215

◆ process_cmd_win_event()

bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

Definition at line 387 of file pgedit.cpp.

390  {
391  char msg[160];
392  bool exit = false;
393 
394  color_mode = CM_RAINBOW;
395 
396  // Run recognition on the full page if needed.
397  switch (cmd_event) {
398  case BLAMER_CMD_EVENT:
402  case SHOW_BOLD_CMD_EVENT:
408  if (!recog_done) {
409  recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
410  recog_done = true;
411  }
412  break;
413  default:
414  break;
415  }
416 
417  char* parameter;
418 
419  switch (cmd_event) {
420  case NULL_CMD_EVENT:
421  break;
422 
424  case DUMP_WERD_CMD_EVENT:
427  case RECOG_WERDS:
428  case RECOG_PSEUDO:
429  case SHOW_BLOB_FEATURES:
430  mode =(CMD_EVENTS) cmd_event;
431  break;
433  mode = DEBUG_WERD_CMD_EVENT;
434  parameter = image_win->ShowInputDialog("Config File Name");
435  word_config_ = parameter;
436  delete[] parameter;
437  break;
439  if (new_value[0] == 'T')
441  else
443  mode = CHANGE_DISP_CMD_EVENT;
444  break;
445  case BLAMER_CMD_EVENT:
446  if (new_value[0] == 'T')
448  else
451  mode = CHANGE_DISP_CMD_EVENT;
452  break;
454  if (new_value[0] == 'T')
456  else
458  mode = CHANGE_DISP_CMD_EVENT;
459  break;
460  case POLYGONAL_CMD_EVENT:
461  if (new_value[0] == 'T')
463  else
465  mode = CHANGE_DISP_CMD_EVENT;
466  break;
467  case BL_NORM_CMD_EVENT:
468  if (new_value[0] == 'T')
470  else
472  mode = CHANGE_DISP_CMD_EVENT;
473  break;
474  case BITMAP_CMD_EVENT:
475  if (new_value[0] == 'T')
477  else
479  mode = CHANGE_DISP_CMD_EVENT;
480  break;
483  break;
484  case IMAGE_CMD_EVENT:
485  display_image =(new_value[0] == 'T');
487  break;
488  case BLOCKS_CMD_EVENT:
489  display_blocks =(new_value[0] == 'T');
491  break;
492  case BASELINES_CMD_EVENT:
493  display_baselines =(new_value[0] == 'T');
495  break;
497  color_mode = CM_SUBSCRIPT;
499  break;
501  color_mode = CM_SUPERSCRIPT;
503  break;
505  color_mode = CM_ITALIC;
507  break;
508  case SHOW_BOLD_CMD_EVENT:
509  color_mode = CM_BOLD;
511  break;
513  color_mode = CM_UNDERLINE;
515  break;
517  color_mode = CM_FIXEDPITCH;
519  break;
521  color_mode = CM_SERIF;
523  break;
525  color_mode = CM_SMALLCAPS;
527  break;
529  color_mode = CM_DROPCAPS;
531  break;
532  case REFRESH_CMD_EVENT:
534  break;
535  case QUIT_CMD_EVENT:
536  exit = true;
538  break;
539 
540  default:
541  snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
542  cmd_event, new_value);
543  image_win->AddMessage(msg);
544  break;
545  }
546  return exit;
547 }
BOOL8 display_baselines
Definition: pgedit.cpp:119
BOOL8 display_image
Definition: pgedit.cpp:117
PAGE_RES * current_page_res
Definition: pgedit.cpp:121
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:759
BOOL8 display_blocks
Definition: pgedit.cpp:118
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:298
Definition: werd.h:54
Definition: werd.h:49
void turn_off_bit(uint8_t bit_num)
Definition: bits16.h:43
static void Exit()
Definition: scrollview.cpp:585
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:38
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:736
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:308
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:944
Definition: werd.h:50
void AddMessage(const char *format,...)
Definition: scrollview.cpp:563
BITS16 word_display_mode
Definition: pgedit.cpp:115

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

The user has done something in the image window: mouse down or up. Work out what it is and act on it. If DOWN, just remember where it was. If UP, apply the operation defined by the current mode to each word in the selected area.

Definition at line 559 of file pgedit.cpp.

560  {
561  // The following variable should remain static, since it is used by
562  // debug editor, which uses a single Tesseract instance.
563  static ICOORD down;
564  ICOORD up;
565  TBOX selection_box;
566  char msg[80];
567 
568  switch(event.type) {
569 
570  case SVET_SELECTION:
571  if (event.type == SVET_SELECTION) {
572  down.set_x(event.x + event.x_size);
573  down.set_y(event.y + event.y_size);
574  if (mode == SHOW_POINT_CMD_EVENT)
575  show_point(current_page_res, event.x, event.y);
576  }
577 
578  up.set_x(event.x);
579  up.set_y(event.y);
580 
581  selection_box = TBOX(down, up);
582 
583  switch(mode) {
587  selection_box,
589  break;
590  case DUMP_WERD_CMD_EVENT:
592  selection_box,
594  break;
597  selection_box,
599  break;
601  debug_word(current_page_res, selection_box);
602  break;
604  break; // ignore up event
605 
606  case RECOG_WERDS:
607  #ifndef DISABLED_LEGACY_ENGINE
608  image_win->AddMessage("Recogging selected words");
610  selection_box,
612  #endif // ndef DISABLED_LEGACY_ENGINE
613  break;
614  case RECOG_PSEUDO:
615  image_win->AddMessage("Recogging selected blobs");
616  recog_pseudo_word(current_page_res, selection_box);
617  break;
618  case SHOW_BLOB_FEATURES:
619  blob_feature_display(current_page_res, selection_box);
620  break;
621 
622  default:
623  sprintf(msg, "Mode %d not yet implemented", mode);
624  image_win->AddMessage(msg);
625  break;
626  }
627  default:
628  break;
629  }
630 }
PAGE_RES * current_page_res
Definition: pgedit.cpp:121
void set_x(int16_t xin)
rewrite function
Definition: points.h:62
Definition: rect.h:34
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:959
int x
Definition: scrollview.h:66
integer coordinate
Definition: points.h:32
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:715
int y_size
Definition: scrollview.h:69
int x_size
Definition: scrollview.h:68
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:727
void show_point(PAGE_RES *page_res, float x, float y)
Definition: pgedit.cpp:653
SVEventType type
Definition: scrollview.h:64
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:67
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:920
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
int y
Definition: scrollview.h:67
void set_y(int16_t yin)
rewrite function
Definition: points.h:66
void AddMessage(const char *format,...)
Definition: scrollview.cpp:563
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:637

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES * page_res,
TBOX & selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

33  {
34  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr;
35  page_res_it.forward()) {
36  WERD* word = page_res_it.word()->word;
37  if (word->bounding_box().overlap(selection_box)) {
38  if (!(this->*word_processor)(&page_res_it))
39  return;
40  }
41  }
42 }
TBOX bounding_box() const
Definition: werd.cpp:159
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
bool overlap(const TBOX &box) const
Definition: rect.h:355

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 125 of file control.cpp.

128  {
129  if (word_config != nullptr) {
130  if (word_box.major_overlap(target_word_box)) {
131  if (backup_config_file_ == nullptr) {
132  backup_config_file_ = kBackUpConfigFile;
133  FILE* config_fp = fopen(backup_config_file_, "wb");
134  if (config_fp == nullptr) {
135  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
136  } else {
137  ParamUtils::PrintParams(config_fp, params());
138  fclose(config_fp);
139  }
140  ParamUtils::ReadParamsFile(word_config,
142  params());
143  }
144  } else {
145  if (backup_config_file_ != nullptr) {
146  ParamUtils::ReadParamsFile(backup_config_file_,
148  params());
149  backup_config_file_ = nullptr;
150  }
151  }
152  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
153  return false;
154  }
155  return true;
156 }
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:171
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
ParamsVectors * params()
Definition: ccutil.h:62
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const char *const kBackUpConfigFile
Definition: control.cpp:53

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 139 of file docqual.cpp.

140  {
141  if ((tessedit_good_quality_unrej && good_quality_doc))
142  unrej_good_quality_words(page_res_it);
143  doc_and_block_rejection(page_res_it, good_quality_doc);
144  if (unlv_tilde_crunching) {
145  tilde_crunch(page_res_it);
146  tilde_delete(page_res_it);
147  }
148 }
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:418
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:161
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:233
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:590

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 60 of file tessedit.cpp.

61  {
62  STRING path = datadir;
63  path += "configs/";
64  path += filename;
65  FILE* fp;
66  if ((fp = fopen(path.string(), "rb")) != nullptr) {
67  fclose(fp);
68  } else {
69  path = datadir;
70  path += "tessconfigs/";
71  path += filename;
72  if ((fp = fopen(path.string(), "rb")) != nullptr) {
73  fclose(fp);
74  } else {
75  path = filename;
76  }
77  }
78  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
79 }
const char * string() const
Definition: strngs.cpp:196
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
ParamsVectors * params()
Definition: ccutil.h:62
Definition: strngs.h:45
STRING datadir
Definition: ccutil.h:64

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT * pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 949 of file control.cpp.

950  {
951 #ifdef DISABLED_LEGACY_ENGINE
952  return false;
953 #else
954  *make_next_word_fuzzy = false;
955  WERD* real_word = pr_it->word()->word;
956  if (real_word->rej_cblob_list()->empty() ||
957  real_word->cblob_list()->empty() ||
958  real_word->rej_cblob_list()->length() > noise_maxperword)
959  return false;
960  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
961  // Get the noise outlines into a vector with matching bool map.
962  GenericVector<C_OUTLINE*> outlines;
963  real_word->GetNoiseOutlines(&outlines);
964  GenericVector<bool> word_wanted;
965  GenericVector<bool> overlapped_any_blob;
966  GenericVector<C_BLOB*> target_blobs;
967  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
968  &word_wanted, &overlapped_any_blob,
969  &target_blobs);
970  // Filter the outlines that overlapped any blob and put them into the word
971  // now. This simplifies the remaining task and also makes it more accurate
972  // as it has more completed blobs to work on.
973  GenericVector<bool> wanted;
974  GenericVector<C_BLOB*> wanted_blobs;
975  GenericVector<C_OUTLINE*> wanted_outlines;
976  int num_overlapped = 0;
977  int num_overlapped_used = 0;
978  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
979  if (overlapped_any_blob[i]) {
980  ++num_overlapped;
981  if (word_wanted[i]) ++num_overlapped_used;
982  wanted.push_back(word_wanted[i]);
983  wanted_blobs.push_back(target_blobs[i]);
984  wanted_outlines.push_back(outlines[i]);
985  outlines[i] = nullptr;
986  }
987  }
988  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
989  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
990  &target_blobs);
991  int non_overlapped = 0;
992  int non_overlapped_used = 0;
993  for (int i = 0; i < word_wanted.size(); ++i) {
994  if (word_wanted[i]) ++non_overlapped_used;
995  if (outlines[i] != nullptr) ++non_overlapped_used;
996  }
997  if (debug_noise_removal) {
998  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
999  num_overlapped_used, num_overlapped, non_overlapped_used,
1000  non_overlapped);
1001  real_word->bounding_box().print();
1002  }
1003  // Now we have decided which outlines we want, put them into the real_word.
1004  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
1005  make_next_word_fuzzy)) {
1006  pr_it->MakeCurrentWordFuzzy();
1007  }
1008  // TODO(rays) Parts of combos have a deep copy of the real word, and need
1009  // to have their noise outlines moved/assigned in the same way!!
1010  return num_overlapped_used != 0 || non_overlapped_used != 0;
1011 #endif // ndef DISABLED_LEGACY_ENGINE
1012 }
int size() const
Definition: genericvector.h:71
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1074
void print() const
Definition: rect.h:278
TBOX bounding_box() const
Definition: werd.cpp:159
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:529
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:547
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
int push_back(T object)
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1019
WERD * word
Definition: pageres.h:189

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1 is run; if 2, pass 2 and higher are run. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 308 of file control.cpp.

312  {
313  PAGE_RES_IT page_res_it(page_res);
314 
316  tessedit_test_adaption.set_value (TRUE);
317  tessedit_minimal_rejection.set_value (TRUE);
318  }
319 
320  if (dopasses==0 || dopasses==1) {
321  page_res_it.restart_page();
322  // ****************** Pass 1 *******************
323 
324  #ifndef DISABLED_LEGACY_ENGINE
325  // If the adaptive classifier is full switch to one we prepared earlier,
326  // ie on the previous page. If the current adaptive classifier is non-empty,
327  // prepare a backup starting at this page, in case it fills up. Do all this
328  // independently for each language.
329  if (AdaptiveClassifierIsFull()) {
331  } else if (!AdaptiveClassifierIsEmpty()) {
333  }
334  // Now check the sub-langs as well.
335  for (int i = 0; i < sub_langs_.size(); ++i) {
336  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
337  sub_langs_[i]->SwitchAdaptiveClassifier();
338  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
339  sub_langs_[i]->StartBackupAdaptiveClassifier();
340  }
341  }
342 
343  #endif // ndef DISABLED_LEGACY_ENGINE
344 
345  // Set up all words ready for recognition, so that if parallelism is on
346  // all the input and output classes are ready to run the classifier.
348  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
349  #ifndef DISABLED_LEGACY_ENGINE
350  if (tessedit_parallelize) {
351  PrerecAllWordsPar(words);
352  }
353  #endif // ndef DISABLED_LEGACY_ENGINE
354 
355  stats_.word_count = words.size();
356 
357  stats_.dict_words = 0;
358  stats_.doc_blob_quality = 0;
359  stats_.doc_outline_errs = 0;
360  stats_.doc_char_quality = 0;
361  stats_.good_char_count = 0;
362  stats_.doc_good_char_quality = 0;
363 
364  most_recently_used_ = this;
365  // Run pass 1 word recognition.
366  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
367  // Pass 1 post-processing.
368  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
369  page_res_it.forward()) {
370  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
371  fix_rep_char(&page_res_it);
372  continue;
373  }
374 
375  // Count dict words.
376  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
377  ++(stats_.dict_words);
378 
379  // Update misadaption log (we only need to do it on pass 1, since
380  // adaption only happens on this pass).
381  if (page_res_it.word()->blamer_bundle != nullptr &&
382  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
383  page_res->misadaption_log.push_back(
384  page_res_it.word()->blamer_bundle->misadaption_debug());
385  }
386  }
387  }
388 
389  if (dopasses == 1) return true;
390 
391  #ifndef DISABLED_LEGACY_ENGINE
392 
393  // ****************** Pass 2 *******************
395  AnyTessLang()) {
396  page_res_it.restart_page();
398  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
399  if (tessedit_parallelize) {
400  PrerecAllWordsPar(words);
401  }
402  most_recently_used_ = this;
403  // Run pass 2 word recognition.
404  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
405  }
406 
407  // The next passes are only required for Tess-only.
408  if (AnyTessLang() && !AnyLSTMLang()) {
409  // ****************** Pass 3 *******************
410  // Fix fuzzy spaces.
412 
415  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
416 
417  // ****************** Pass 4 *******************
420 
421  // ****************** Pass 5,6 *******************
422  rejection_passes(page_res, monitor, target_word_box, word_config);
423 
424  // ****************** Pass 8 *******************
425  font_recognition_pass(page_res);
426 
427  // ****************** Pass 9 *******************
428  // Check the correctness of the final results.
429  blamer_pass(page_res);
430  script_pos_pass(page_res);
431  }
432 
433  #endif // ndef DISABLED_LEGACY_ENGINE
434 
435  // Write results pass.
437  // This is now redundant, but retained commented so show how to obtain
438  // bounding boxes and style information.
439 
440  #ifndef DISABLED_LEGACY_ENGINE
441  // changed by jetsoft
442  // needed for dll to output memory structure
443  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
444  output_pass(page_res_it, target_word_box);
445  // end jetsoft
446  #endif //ndef DISABLED_LEGACY_ENGINE
447 
448  const PageSegMode pageseg_mode = static_cast<PageSegMode>(
449  static_cast<int>(tessedit_pageseg_mode));
450  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
451 
452  // Remove empty words, as these mess up the result iterators.
453  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
454  page_res_it.forward()) {
455  const WERD_RES* word = page_res_it.word();
456  const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
457  ? page_res_it.block()->block->pdblk.poly_block()
458  : nullptr;
459  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
460  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
461  page_res_it.DeleteCurrentWord();
462  }
463  }
464 
465  if (monitor != nullptr) {
466  monitor->progress = 100;
467  }
468  return true;
469 }
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:159
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2060
int size() const
Definition: genericvector.h:71
#define TRUE
Definition: capi.h:51
GenericVector< STRING > misadaption_log
Definition: pageres.h:92
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
bool tessedit_enable_bigram_correction
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:740
bool right_to_left() const
bool IsText() const
Definition: polyblk.h:49
int length() const
Definition: ratngs.h:303
int push_back(T object)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:618
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1725
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:473
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:78
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:80
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:218
bool AnyTessLang() const
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
bool IsAllSpaces() const
Definition: ratngs.h:521
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:322
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:716
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2117
bool AnyLSTMLang() const
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:122
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39
WERD_CHOICE * best_choice
Definition: pageres.h:235
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:43
WERD * word
Definition: pageres.h:189

◆ recog_interactive()

bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT * pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 82 of file control.cpp.

82  {
83  int16_t char_qual;
84  int16_t good_char_qual;
85 
86  WordData word_data(*pr_it);
87  SetupWordPassN(2, &word_data);
88  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
89  if (lstm_recognizer_ == nullptr) {
90 #ifndef DISABLED_LEGACY_ENGINE
91  classify_word_and_language(2, pr_it, &word_data);
92 #endif // ndef DISABLED_LEGACY_ENGINE
93  } else {
94  classify_word_and_language(1, pr_it, &word_data);
95  }
96 #ifndef DISABLED_LEGACY_ENGINE
98  WERD_RES* word_res = pr_it->word();
99  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
100  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
101  "char_quality: %d; good_char_quality: %d\n",
102  word_res->reject_map.length(),
103  word_blob_quality(word_res, pr_it->row()->row),
104  word_outline_errs(word_res), char_qual, good_char_qual);
105  }
106 #endif // ndef DISABLED_LEGACY_ENGINE
107  return true;
108 }
ROW_RES * row() const
Definition: pageres.h:754
REJMAP reject_map
Definition: pageres.h:287
int32_t length() const
Definition: rejctmap.h:223
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
ROW * row
Definition: pageres.h:143
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES * page_res,
TBOX & selection_box 
)

Definition at line 67 of file control.cpp.

68  {
69  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
70  if (it != nullptr) {
72  it->DeleteCurrentWord();
73  delete it;
74  }
75 }
void DeleteCurrentWord()
Definition: pageres.cpp:1450
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 82 of file recogtraining.cpp.

85  {
86  STRING box_fname = fname;
87  const char *lastdot = strrchr(box_fname.string(), '.');
88  if (lastdot != nullptr) box_fname[lastdot - box_fname.string()] = '\0';
89  box_fname += ".box";
90  // ReadNextBox() will close box_file
91  FILE *box_file = fopen(box_fname.string(), "r");
92  if (box_file == nullptr) {
93  tprintf("Error: Could not open file %s\n", box_fname.string());
94  ASSERT_HOST(box_file);
95  }
96 
97  PAGE_RES_IT page_res_it;
98  page_res_it.page_res = page_res;
99  page_res_it.restart_page();
100  STRING label;
101 
102  // Process all the words on this page.
103  TBOX tbox; // tesseract-identified box
104  TBOX bbox; // box from the box file
105  bool keep_going;
106  int line_number = 0;
107  int examined_words = 0;
108  do {
109  keep_going = read_t(&page_res_it, &tbox);
110  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
111  &bbox);
112  // Align bottom left points of the TBOXes.
113  while (keep_going &&
114  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
115  if (bbox.bottom() < tbox.bottom()) {
116  page_res_it.forward();
117  keep_going = read_t(&page_res_it, &tbox);
118  } else {
119  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
120  &bbox);
121  }
122  }
123  while (keep_going &&
124  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125  if (bbox.left() > tbox.left()) {
126  page_res_it.forward();
127  keep_going = read_t(&page_res_it, &tbox);
128  } else {
129  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
130  &bbox);
131  }
132  }
133  // OCR the word if top right points of the TBOXes are similar.
134  if (keep_going &&
135  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
136  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
137  ambigs_classify_and_output(label.string(), &page_res_it, output_file);
138  examined_words++;
139  }
140  page_res_it.forward();
141  } while (keep_going);
142 
143  // Set up scripts on all of the words that did not get sent to
144  // ambigs_classify_and_output. They all should have, but if all the
145  // werd_res's don't get uch_sets, tesseract will crash when you try
146  // to iterate over them. :-(
147  int total_words = 0;
148  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
149  page_res_it.forward()) {
150  if (page_res_it.word()) {
151  if (page_res_it.word()->uch_set == nullptr)
152  page_res_it.word()->SetupFake(unicharset);
153  total_words++;
154  }
155  }
156  if (examined_words < 0.85 * total_words) {
157  tprintf("TODO(antonova): clean up recog_training_segmented; "
158  " It examined only a small fraction of the ambigs image.\n");
159  }
160  tprintf("recog_training_segmented: examined %d / %d words.\n",
161  examined_words, total_words);
162 }
BLOCK_RES * block() const
Definition: pageres.h:757
const char * string() const
Definition: strngs.cpp:196
Definition: rect.h:34
WERD_RES * restart_page()
Definition: pageres.h:698
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
UNICHARSET unicharset
Definition: ccutil.h:68
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
WERD_RES * word() const
Definition: pageres.h:751
const int16_t kMaxBoxEdgeDiff
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
PAGE_RES * page_res
Definition: pageres.h:677
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358
Definition: strngs.h:45
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:126
const UNICHARSET * uch_set
Definition: pageres.h:206
int16_t right() const
Definition: rect.h:79
WERD_RES * forward()
Definition: pageres.h:731
int16_t bottom() const
Definition: rect.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES * word)

Definition at line 40 of file tfacepp.cpp.

40  {
41  if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
43  if (classify_debug_level) tprintf("No truth for word - skipping\n");
44  word->tess_failed = true;
45  return;
46  }
49  word->SetupBoxWord();
50  if (word->best_choice->length() != word->box_word->length()) {
51  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
52  "Strlen=%d; #Blobs=%d\n",
53  word->best_choice->debug_string().string(),
54  word->best_choice->length(), word->box_word->length());
55  }
56  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
57  // Check that the ratings matrix size matches the sum of all the
58  // segmentation states.
59  if (!word->StatesAllValid()) {
60  tprintf("Not all words have valid states relative to ratings matrix!!");
61  word->DebugWordChoices(true, nullptr);
62  ASSERT_HOST(word->StatesAllValid());
63  }
65  /* Override the permuter type if a straight dictionary check disagrees. */
66  uint8_t perm_type = word->best_choice->permuter();
67  if ((perm_type != SYSTEM_DAWG_PERM) &&
68  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
69  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
70  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
71  (real_dict_perm_type == FREQ_DAWG_PERM) ||
72  (real_dict_perm_type == USER_DAWG_PERM)) &&
74  word->best_choice->unichar_lengths().string()) > 0)) {
75  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
76  }
77  }
79  perm_type != word->best_choice->permuter()) {
80  tprintf("Permuter Type Flipped from %d to %d\n",
81  perm_type, word->best_choice->permuter());
82  }
83  }
84  // Factored out from control.cpp
85  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
86  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
87  static_cast<int>(strspn(word->best_choice->unichar_string().string(),
88  " ")) == word->best_choice->length()) {
89  word->tess_failed = true;
90  word->reject_map.initialise(word->box_word->length());
92  } else {
93  word->tess_failed = false;
94  }
95 }
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:500
bool tess_failed
Definition: pageres.h:288
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
bool wordrec_skip_no_truth_words
Definition: wordrec.h:235
uint8_t permuter() const
Definition: ratngs.h:346
void SetupBoxWord()
Definition: pageres.cpp:855
const STRING & unichar_lengths() const
Definition: ratngs.h:548
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:486
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
const STRING debug_string() const
Definition: ratngs.h:505
int length() const
Definition: boxword.h:83
BlamerBundle * blamer_bundle
Definition: pageres.h:246
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
const STRING & unichar_string() const
Definition: ratngs.h:541
void rej_word_tess_failure()
Definition: rejctmap.cpp:354
WERD_CHOICE * raw_choice
Definition: pageres.h:240
TWERD * chopped_word
Definition: pageres.h:215
bool StatesAllValid()
Definition: pageres.cpp:464
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::BoxWord * box_word
Definition: pageres.h:266
void initialise(int16_t length)
Definition: rejctmap.cpp:275
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_permuter(uint8_t perm)
Definition: ratngs.h:375

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word)

Definition at line 104 of file tfacepp.cpp.

104  {
105  int word_length = word->chopped_word->NumBlobs(); // no of blobs
106  if (word_length > MAX_UNDIVIDED_LENGTH) {
107  return split_and_recog_word(word);
108  }
109  cc_recog(word);
110  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
111 
112  // Do sanity checks and minor fixes on best_choice.
113  if (word->best_choice->length() > word_length) {
114  word->best_choice->make_bad(); // should never happen
115  tprintf("recog_word: Discarded long string \"%s\""
116  " (%d characters vs %d blobs)\n",
117  word->best_choice->unichar_string().string(),
118  word->best_choice->length(), word_length);
119  tprintf("Word is at:");
120  word->word->bounding_box().print();
121  }
122  if (word->best_choice->length() < word_length) {
123  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
124  while (word->best_choice->length() < word_length) {
125  word->best_choice->append_unichar_id(space_id, 1, 0.0,
126  word->best_choice->certainty());
127  }
128  }
129 }
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:29
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
void print() const
Definition: rect.h:278
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: werd.cpp:159
int NumBlobs() const
Definition: blobs.h:432
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float certainty() const
Definition: ratngs.h:330
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
void cc_recog(WERD_RES *word)
Definition: tface.cpp:113
UNICHARSET unicharset
Definition: ccutil.h:68
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:138
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
const STRING & unichar_string() const
Definition: ratngs.h:541
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC monitor,
PAGE_RES_IT pr_it,
GenericVector< WordData > *  words 
)

Definition at line 218 of file control.cpp.

// Runs recognition pass pass_n over every WordData in *words, moving pr_it
// forward to the matching page word before classifying it.
// When monitor is non-null, its ocr_alive/progress fields and
// progress_callback2 are updated for each word.
// Returns false, after calling SetupFake on the remaining words, if the
// monitor's deadline is exceeded or its cancel callback returns non-zero;
// returns true otherwise.
220  {
221  // TODO(rays) Before this loop can be parallelized (it would yield a massive
222  // speed-up) all remaining member globals need to be converted to local/heap
223  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
224  // added. The results will be significantly different with adaption on, and
225  // deterioration will need investigation.
226  pr_it->restart_page();
227  for (int w = 0; w < words->size(); ++w) {
228  WordData* word = &(*words)[w];
229  if (w > 0) word->prev_word = &(*words)[w - 1];
230  if (monitor != nullptr) {
231  monitor->ocr_alive = TRUE;
     // Pass 1 is mapped onto 0-70 of the progress value; any other pass onto
     // 70-100.
232  if (pass_n == 1) {
233  monitor->progress = 70 * w / words->size();
234  if (monitor->progress_callback2 != nullptr) {
     // NOTE(review): pr_it is only synced to this WordData further down, so
     // the box reported here may belong to a different word - confirm.
235  TBOX box = pr_it->word()->word->bounding_box();
236  (*monitor->progress_callback2)(monitor, box.left(),
237  box.right(), box.top(), box.bottom());
238  }
239  } else {
240  monitor->progress = 70 + 30 * w / words->size();
241  if (monitor->progress_callback2 != nullptr) {
242  (*monitor->progress_callback2)(monitor, 0, 0, 0, 0);
243  }
244  }
245  if (monitor->deadline_exceeded() ||
246  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
247  words->size()))) {
248  // Timeout. Fake out the rest of the words.
249  for (; w < words->size(); ++w) {
250  (*words)[w].word->SetupFake(unicharset);
251  }
252  return false;
253  }
254  }
255  if (word->word->tess_failed) {
256  int s;
257  for (s = 0; s < word->lang_words.size() &&
258  word->lang_words[s]->tess_failed; ++s) {}
259  // If all are failed, skip it. Image words are skipped by this test.
     // NOTE(review): the loop above exits with s <= lang_words.size(), so this
     // `>` comparison looks like it can never be true; the comment suggests
     // `==` or `>=` was meant - confirm before changing.
260  if (s > word->lang_words.size()) continue;
261  }
262  // Sync pr_it with the wth WordData.
263  while (pr_it->word() != nullptr && pr_it->word() != word->word)
264  pr_it->forward();
265  ASSERT_HOST(pr_it->word() != nullptr);
266  bool make_next_word_fuzzy = false;
267  if (!AnyLSTMLang() &&
268  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
269  // Needs to be setup again to see the new outlines in the chopped_word.
270  SetupWordPassN(pass_n, word);
271  }
272 
273  classify_word_and_language(pass_n, pr_it, word);
275  tprintf("Pass%d: %s [%s]\n", pass_n,
276  word->word->best_choice->unichar_string().string(),
277  word->word->best_choice->debug_string().string());
278  }
279  pr_it->forward();
     // ReassignDiacritics may ask for the following word to be marked fuzzy.
280  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
281  pr_it->MakeCurrentWordFuzzy();
282  }
283  }
284  return true;
285 }
int size() const
Definition: genericvector.h:71
#define TRUE
Definition: capi.h:51
void * cancel_this
this or other data for cancel
Definition: ocrclass.h:132
TBOX bounding_box() const
Definition: werd.cpp:159
Definition: rect.h:34
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:127
WERD_RES * restart_page()
Definition: pageres.h:698
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
UNICHARSET unicharset
Definition: ccutil.h:68
bool deadline_exceeded() const
Definition: ocrclass.h:164
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:129
PROGRESS_FUNC2 progress_callback2
monitor-aware progress callback
Definition: ocrclass.h:131
int16_t right() const
Definition: rect.h:79
WERD_RES * forward()
Definition: pageres.h:731
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:949
int16_t bottom() const
Definition: rect.h:65
bool AnyLSTMLang() const
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:122
#define ASSERT_HOST(x)
Definition: errcode.h:84
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( STRING image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES word)

Definition at line 268 of file reject.cpp.

// Flags blobs that lie too close to the image border.
// If the word's bounding box is within tessedit_image_border of any image
// edge, each blob box of word->box_word is tested the same way and matching
// entries of word->reject_map are marked with setrej_edge_char().
268  {
269  TBOX word_box = word->word->bounding_box();
270  // Use the box_word as it is already denormed back to image coordinates.
271  int blobcount = word->box_word->length();
272 
     // Cheap whole-word test first; blobs are only examined if it passes.
273  if (word_box.left() < tessedit_image_border ||
274  word_box.bottom() < tessedit_image_border ||
275  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
276  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
     // The reject map is indexed by blob below, so the sizes must agree.
277  ASSERT_HOST(word->reject_map.length() == blobcount);
278  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
279  TBOX blob_box = word->box_word->BlobBox(blobindex);
280  if (blob_box.left() < tessedit_image_border ||
281  blob_box.bottom() < tessedit_image_border ||
282  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
283  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
284  word->reject_map[blobindex].setrej_edge_char();
285  // Close to edge
286  }
287  }
288  }
289 }
REJMAP reject_map
Definition: pageres.h:287
TBOX bounding_box() const
Definition: werd.cpp:159
Definition: rect.h:34
int32_t length() const
Definition: rejctmap.h:223
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int length() const
Definition: boxword.h:83
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
tesseract::BoxWord * box_word
Definition: pageres.h:266
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES word)

Definition at line 198 of file reject.cpp.

198  {
199  int16_t i;
200  int16_t offset;
201 
202  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
203  offset += word->best_choice->unichar_lengths()[i], i += 1) {
205  contains (word->best_choice->unichar_string()[offset])) {
206  //rej 1Il conflict
207  word->reject_map[i].setrej_1Il_conflict ();
208  }
209  }
210 }
REJMAP reject_map
Definition: pageres.h:287
const STRING & unichar_lengths() const
Definition: ratngs.h:548
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES word)

Definition at line 578 of file reject.cpp.

578  {
579  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
580 
581  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
584 }
REJMAP reject_map
Definition: pageres.h:287
int32_t length() const
Definition: rejctmap.h:223
int16_t reject_count()
Definition: rejctmap.h:229
double rej_whole_of_mostly_reject_word_fract
void rej_word_mostly_rej()
Definition: rejctmap.cpp:408

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES page_res,
ETEXT_DESC monitor,
const TBOX target_word_box,
const char *  word_config 
)

Definition at line 618 of file control.cpp.

621  {
622  PAGE_RES_IT page_res_it(page_res);
623  // ****************** Pass 5 *******************
624  // Gather statistics on rejects.
625  int word_index = 0;
626  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
628  WERD_RES* word = page_res_it.word();
629  word_index++;
630  if (monitor != nullptr) {
631  monitor->ocr_alive = TRUE;
632  monitor->progress = 95 + 5 * word_index / stats_.word_count;
633  }
634  if (word->rebuild_word == nullptr) {
635  // Word was not processed by tesseract.
636  page_res_it.forward();
637  continue;
638  }
639  check_debug_pt(word, 70);
640 
641  // changed by jetsoft
642  // specific to its needs to extract one word when need
643  if (target_word_box &&
645  *target_word_box, word_config, 4)) {
646  page_res_it.forward();
647  continue;
648  }
649  // end jetsoft
650 
651  page_res_it.rej_stat_word();
652  const int chars_in_word = word->reject_map.length();
653  const int rejects_in_word = word->reject_map.reject_count();
654 
655  const int blob_quality = word_blob_quality(word, page_res_it.row()->row);
656  stats_.doc_blob_quality += blob_quality;
657  const int outline_errs = word_outline_errs(word);
658  stats_.doc_outline_errs += outline_errs;
659  int16_t all_char_quality;
660  int16_t accepted_all_char_quality;
661  word_char_quality(word, page_res_it.row()->row,
662  &all_char_quality, &accepted_all_char_quality);
663  stats_.doc_char_quality += all_char_quality;
664  const uint8_t permuter_type = word->best_choice->permuter();
665  if ((permuter_type == SYSTEM_DAWG_PERM) ||
666  (permuter_type == FREQ_DAWG_PERM) ||
667  (permuter_type == USER_DAWG_PERM)) {
668  stats_.good_char_count += chars_in_word - rejects_in_word;
669  stats_.doc_good_char_quality += accepted_all_char_quality;
670  }
671  check_debug_pt(word, 80);
673  (blob_quality == 0) && (outline_errs >= chars_in_word))
675  check_debug_pt(word, 90);
676  page_res_it.forward();
677  }
678 
680  tprintf
681  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
682  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
683  page_res->char_count, page_res->rej_count,
684  page_res->rej_count / static_cast<float>(page_res->char_count),
685  stats_.doc_blob_quality,
686  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
687  stats_.doc_outline_errs,
688  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
689  stats_.doc_char_quality,
690  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
691  stats_.doc_good_char_quality,
692  (stats_.good_char_count > 0) ?
693  (stats_.doc_good_char_quality /
694  static_cast<float>(stats_.good_char_count)) : 0.0);
695  }
696  bool good_quality_doc =
697  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
698  quality_rej_pc) &&
699  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
700  quality_blob_pc) &&
701  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
703  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
705 
706  // ****************** Pass 6 *******************
707  // Do whole document or whole block rejection pass
708  if (!tessedit_test_adaption) {
710  quality_based_rejection(page_res_it, good_quality_doc);
711  }
712 }
TWERD * rebuild_word
Definition: pageres.h:260
#define TRUE
Definition: capi.h:51
int32_t rej_count
Definition: pageres.h:80
REJMAP reject_map
Definition: pageres.h:287
TBOX bounding_box() const
Definition: werd.cpp:159
uint8_t permuter() const
Definition: ratngs.h:346
int32_t length() const
Definition: rejctmap.h:223
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:127
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:125
int16_t reject_count()
Definition: rejctmap.h:229
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73
int32_t char_count
Definition: pageres.h:79
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:80
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:139
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:122
#define LOC_MM_ADAPT
Definition: errcode.h:52
WERD_CHOICE * best_choice
Definition: pageres.h:235
void rej_word_bad_quality()
Definition: rejctmap.cpp:417
WERD * word
Definition: pageres.h:189

◆ repeated_nonalphanum_wd()

bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES word,
ROW row 
)

Definition at line 587 of file reject.cpp.

587  {
588  int16_t char_quality;
589  int16_t accepted_char_quality;
590 
591  if (word->best_choice->unichar_lengths().length() <= 1)
592  return false;
593 
595  contains(word->best_choice->unichar_string()[0]))
596  return false;
597 
598  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
599  for (int i = 1; i < word->best_choice->length(); ++i) {
600  if (word->best_choice->unichar_id(i) != uch_id) return false;
601  }
602 
603  word_char_quality(word, row, &char_quality, &accepted_char_quality);
604 
605  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
606  (char_quality == accepted_char_quality))
607  return true;
608  else
609  return false;
610 }
int UNICHAR_ID
Definition: unichar.h:35
const STRING & unichar_lengths() const
Definition: ratngs.h:548
char * ok_repeated_ch_non_alphanum_wds
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES word,
WERD_RES new_word 
)

Definition at line 1481 of file control.cpp.

// Debug output comparing word with new_word after an x-height fix attempt.
// For each word prints the best choice string, its debug string and the
// reject map (to debug_fp), followed by a summary line.
1482  {
1483  tprintf("New XHT Match:%s = %s ",
1484  word->best_choice->unichar_string().string(),
1485  word->best_choice->debug_string().string());
1486  word->reject_map.print(debug_fp);
1487  tprintf(" -> %s = %s ",
1488  new_word->best_choice->unichar_string().string(),
1489  new_word->best_choice->debug_string().string());
1490  new_word->reject_map.print(debug_fp);
      // Summary: whether each x-height was guessed, whether new_x_ht exceeds
      // 0.1 ("STILL DOUBT"), and whether the new word was accepted.
1491  tprintf(" %s->%s %s %s\n",
1492  word->guessed_x_ht ? "GUESS" : "CERT",
1493  new_word->guessed_x_ht ? "GUESS" : "CERT",
1494  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1495  accept_new_word ? "ACCEPTED" : "");
1496 }
bool guessed_x_ht
Definition: pageres.h:308
FILE * debug_fp
Definition: tessvars.cpp:24
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const STRING debug_string() const
Definition: ratngs.h:505
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE * best_choice
Definition: pageres.h:235
void print(FILE *fp)
Definition: rejctmap.cpp:323

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES page_res)

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 592 of file tesseractclass.cpp.

// Calls ResetAdaptiveClassifierInternal() on every entry of sub_langs_.
// NOTE(review): listing line 593 is absent from this excerpt; presumably it
// resets this instance's own classifier as well - confirm against the source.
592  {
594  for (int i = 0; i < sub_langs_.size(); ++i) {
595  sub_langs_[i]->ResetAdaptiveClassifierInternal();
596  }
597 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 602 of file tesseractclass.cpp.

// Calls ResetDocumentDictionary() on the Dict of every entry of sub_langs_.
// NOTE(review): listing line 603 is absent from this excerpt; presumably it
// resets this instance's own dictionary as well - confirm against the source.
602  {
604  for (int i = 0; i < sub_langs_.size(); ++i) {
605  sub_langs_[i]->getDict().ResetDocumentDictionary();
606  }
607 }
void ResetDocumentDictionary()
Definition: dict.h:311
Dict & getDict() override

◆ reskew()

const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 193 of file tesseractclass.h.

// Accessor: returns the reskew_ member by const reference.
193  {
194  return reskew_;
195  }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 910 of file control.cpp.

913  {
914  if (debug) {
915  tprintf("Trying word using lang %s, oem %d\n",
916  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
917  }
918  // Run the recognizer on the word.
919  PointerVector<WERD_RES> new_words;
920  (this->*recognizer)(word_data, in_word, &new_words);
921  if (new_words.empty()) {
922  // Transfer input word to new_words, as the classifier must have put
923  // the result back in the input.
924  new_words.push_back(*in_word);
925  *in_word = nullptr;
926  }
927  if (debug) {
928  for (int i = 0; i < new_words.size(); ++i)
929  new_words[i]->DebugTopChoice("Lang result");
930  }
931  // Initial version is a bit of a hack based on better certainty and rating
932  // or a dictionary vs non-dictionary word.
933  return SelectBestWords(classify_max_rating_ratio,
935  debug, &new_words, best_words);
936 }
double classify_max_certainty_margin
Definition: classify.h:445
const char * string() const
Definition: strngs.cpp:196
STRING lang
Definition: ccutil.h:66
double classify_max_rating_ratio
Definition: classify.h:443
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 270 of file tesseractclass.h.

// Accessor: returns the right_to_left_ member.
270  {
271  return right_to_left_;
272  }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)

◆ safe_dict_word()

int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES werd_res)

Definition at line 612 of file reject.cpp.

// Returns the dict_word() type of werd_res's best choice, as computed by the
// Tesseract instance stored in werd_res->tesseract, except that a result of
// DOC_DAWG_PERM is reported as 0.
612  {
613  const WERD_CHOICE &word = *werd_res->best_choice;
614  int dict_word_type = werd_res->tesseract->dict_word(word);
615  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
616 }
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
tesseract::Tesseract * tesseract
Definition: pageres.h:282
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ scaled_color()

Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 253 of file tesseractclass.h.

// Accessor: returns the scaled_color_ pointer member.
253  {
254  return scaled_color_;
255  }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 256 of file tesseractclass.h.

// Accessor: returns the scaled_factor_ member.
256  {
257  return scaled_factor_;
258  }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES page_res)

Definition at line 740 of file control.cpp.

// Walks every word of page_res. For each word not flagged W_REP_CHAR it
// decides whether the word is small caps (setting word->small_caps) and then
// calls word->SetScriptPositions().
740  {
741  PAGE_RES_IT page_res_it(page_res);
742  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
743  page_res_it.forward()) {
744  WERD_RES* word = page_res_it.word();
745  if (word->word->flag(W_REP_CHAR)) {
     // NOTE(review): `continue` also runs the for-loop's forward(), so the
     // iterator advances twice here and the word after a W_REP_CHAR word is
     // never examined - confirm this is intended.
746  page_res_it.forward();
747  continue;
748  }
749  const float x_height = page_res_it.block()->block->x_height();
750  float word_x_height = word->x_height;
     // If the word's x-height is outside the range allowed by its best
     // choice, fall back to the midpoint of that range.
751  if (word_x_height < word->best_choice->min_x_height() ||
752  word_x_height > word->best_choice->max_x_height()) {
753  word_x_height = (word->best_choice->min_x_height() +
754  word->best_choice->max_x_height()) / 2.0f;
755  }
756  // Test for small caps. Word capheight must be close to block xheight,
757  // and word must contain no lower case letters, and at least one upper case.
758  const double small_cap_xheight = x_height * kXHeightCapRatio;
759  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
760  if (word->uch_set->script_has_xheight() &&
761  small_cap_xheight - small_cap_delta <= word_x_height &&
762  word_x_height <= small_cap_xheight + small_cap_delta) {
763  // Scan for upper/lower.
764  int num_upper = 0;
765  int num_lower = 0;
766  for (int i = 0; i < word->best_choice->length(); ++i) {
767  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
768  ++num_upper;
769  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
770  ++num_lower;
771  }
772  if (num_upper > 0 && num_lower == 0)
773  word->small_caps = true;
774  }
775  word->SetScriptPositions();
776  }
777 }
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
bool script_has_xheight() const
Definition: unicharset.h:898
void SetScriptPositions()
Definition: pageres.cpp:864
bool small_caps
Definition: pageres.h:299
float max_x_height() const
Definition: ratngs.h:339
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303
float min_x_height() const
Definition: ratngs.h:336
float x_height
Definition: pageres.h:311
const UNICHARSET * uch_set
Definition: pageres.h:206
static const double kXHeightCapRatio
Definition: ccstruct.h:37
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST *> *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 250 of file linerec.cpp.

250  {
251  // Run the segmentation search on the network outputs and make a BoxWord
252  // for each of the output words.
253  // If we drop a word as junk, then there is always a space in front of the
254  // next.
255  const Dict* stopper_dict = lstm_recognizer_->GetDict();
256  if (stopper_dict == nullptr) stopper_dict = &getDict();
257  bool any_nonspace_delimited = false;
258  for (int w = 0; w < words->size(); ++w) {
259  WERD_RES* word = (*words)[w];
260  if (word->best_choice != nullptr &&
262  any_nonspace_delimited = true;
263  break;
264  }
265  }
266  for (int w = 0; w < words->size(); ++w) {
267  WERD_RES* word = (*words)[w];
268  if (word->best_choice == nullptr) {
269  // It is a dud.
270  word->SetupFake(lstm_recognizer_->GetUnicharset());
271  } else {
272  // Set the best state.
273  for (int i = 0; i < word->best_choice->length(); ++i) {
274  int length = word->best_choice->state(i);
275  word->best_state.push_back(length);
276  }
277  word->reject_map.initialise(word->best_choice->length());
278  word->tess_failed = false;
279  word->tess_accepted = true;
280  word->tess_would_adapt = false;
281  word->done = true;
282  word->tesseract = this;
283  float word_certainty = std::min(word->space_certainty,
284  word->best_choice->certainty());
285  word_certainty *= kCertaintyScale;
286  if (getDict().stopper_debug_level >= 1) {
287  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
288  word->best_choice->certainty(), word->space_certainty,
289  std::min(word->space_certainty, word->best_choice->certainty()) *
291  word_certainty);
292  word->best_choice->print();
293  }
294  word->best_choice->set_certainty(word_certainty);
295 
296  word->tess_accepted = stopper_dict->AcceptableResult(word);
297  }
298  }
299 }
const UNICHARSET & GetUnicharset() const
float space_certainty
Definition: pageres.h:316
bool tess_failed
Definition: pageres.h:288
int size() const
Definition: genericvector.h:71
Dict & getDict() override
REJMAP reject_map
Definition: pageres.h:287
void print() const
Definition: ratngs.h:580
int state(int index) const
Definition: ratngs.h:319
const float kCertaintyScale
Definition: linerec.cpp:36
const Dict * GetDict() const
float certainty() const
Definition: ratngs.h:330
int stopper_debug_level
Definition: dict.h:622
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:514
bool tess_would_adapt
Definition: pageres.h:297
bool tess_accepted
Definition: pageres.h:296
GenericVector< int > best_state
Definition: pageres.h:271
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
int push_back(T object)
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358
bool done
Definition: pageres.h:298
tesseract::Tesseract * tesseract
Definition: pageres.h:282
WERD_CHOICE * best_choice
Definition: pageres.h:235
void set_certainty(float new_val)
Definition: ratngs.h:372
void initialise(int16_t length)
Definition: rejctmap.cpp:275

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const STRING input_file,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 100 of file pagesegmain.cpp.

101  {
102  ASSERT_HOST(pix_binary_ != nullptr);
103  int width = pixGetWidth(pix_binary_);
104  int height = pixGetHeight(pix_binary_);
105  // Get page segmentation mode.
106  PageSegMode pageseg_mode = static_cast<PageSegMode>(
107  static_cast<int>(tessedit_pageseg_mode));
108  // If a UNLV zone file can be found, use that instead of segmentation.
109  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
110  input_file != nullptr && input_file->length() > 0) {
111  STRING name = *input_file;
112  const char* lastdot = strrchr(name.string(), '.');
113  if (lastdot != nullptr)
114  name[lastdot - name.string()] = '\0';
115  read_unlv_file(name, width, height, blocks);
116  }
117  if (blocks->empty()) {
118  // No UNLV file present. Work according to the PageSegMode.
119  // First make a single block covering the whole image.
120  BLOCK_IT block_it(blocks);
121  BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
123  block_it.add_to_end(block);
124  } else {
125  // UNLV file present. Use PSM_SINGLE_BLOCK.
126  pageseg_mode = PSM_SINGLE_BLOCK;
127  }
128  // The diacritic_blobs holds noise blobs that may be diacritics. They
129  // are separated out on areas of the image that seem noisy and short-circuit
130  // the layout process, going straight from the initial partition creation
131  // right through to after word segmentation, where they are added to the
132  // rej_cblobs list of the most appropriate word. From there classification
133  // will determine whether they are used.
134  BLOBNBOX_LIST diacritic_blobs;
135  int auto_page_seg_ret_val = 0;
136  TO_BLOCK_LIST to_blocks;
137  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
138  PSM_SPARSE(pageseg_mode)) {
139  auto_page_seg_ret_val = AutoPageSeg(
140  pageseg_mode, blocks, &to_blocks,
141  enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
142  if (pageseg_mode == PSM_OSD_ONLY)
143  return auto_page_seg_ret_val;
144  // To create blobs from the image region bounds uncomment this line:
145  // to_blocks.clear(); // Uncomment to go back to the old mode.
146  } else {
147  deskew_ = FCOORD(1.0f, 0.0f);
148  reskew_ = FCOORD(1.0f, 0.0f);
149  if (pageseg_mode == PSM_CIRCLE_WORD) {
150  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
151  if (pixcleaned != nullptr) {
152  pixDestroy(&pix_binary_);
153  pix_binary_ = pixcleaned;
154  }
155  }
156  }
157 
158  if (auto_page_seg_ret_val < 0) {
159  return -1;
160  }
161 
162  if (blocks->empty()) {
164  tprintf("Empty page\n");
165  return 0; // AutoPageSeg found an empty page.
166  }
167  bool splitting =
169  bool cjk_mode = textord_use_cjk_fp_model;
170 
171  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
172  pix_thresholds_, pix_grey_, splitting || cjk_mode,
173  &diacritic_blobs, blocks, &to_blocks);
174  return auto_page_seg_ret_val;
175 }
#define TRUE
Definition: capi.h:51
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:200
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
const char * string() const
Definition: strngs.cpp:196
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:197
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:203
int textord_debug_tabfind
Definition: alignedblob.cpp:28
bool right_to_left() const
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: ocrblock.h:30
Treat the image as a single word in a circle.
Definition: publictypes.h:175
Definition: strngs.h:45
Definition: points.h:189
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:230
bool read_unlv_file(STRING name, int32_t xsize, int32_t ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:34
Orientation and script detection only.
Definition: publictypes.h:164
void set_right_to_left(bool value)
Definition: ocrblock.h:84
int32_t length() const
Definition: strngs.cpp:191
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT pr_it,
C_BLOB blob,
const GenericVector< C_OUTLINE *> &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1152 of file control.cpp.

// Searches for the subset of outlines (flagged in *ok_outlines) that gives the
// best certainty when classified together with blob (which may be null).
// Starting from the flags passed in, it repeatedly clears the single flag
// whose removal raises the certainty the most.
// Returns true and writes the best flag combination to *ok_outlines when the
// best certainty reaches the target; returns false otherwise. When
// DISABLED_LEGACY_ENGINE is defined the body is compiled out and the result is
// always false.
1155  {
1156 #ifndef DISABLED_LEGACY_ENGINE
1157  STRING best_str;
1158  float target_cert = certainty_threshold;
1159  if (blob != nullptr) {
1160  float target_c2;
1161  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1162  if (debug_noise_removal) {
1163  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1164  target_cert, target_c2);
1165  blob->bounding_box().print();
1166  }
      // Move the blob-only certainty towards certainty_threshold by the
      // fraction noise_cert_factor.
1167  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1168  }
1169  GenericVector<bool> test_outlines = *ok_outlines;
1170  // Start with all the outlines in.
1171  STRING all_str;
1172  GenericVector<bool> best_outlines = *ok_outlines;
1173  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1174  pr_it, blob, &all_str);
1175  if (debug_noise_removal) {
1176  TBOX ol_box;
1177  for (int i = 0; i < test_outlines.size(); ++i) {
1178  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1179  }
1180  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1181  all_str.string(), best_cert, best_cert - target_cert);
1182  ol_box.print();
1183  }
1184  // Iteratively zero out the bit that improves the certainty the most, until
1185  // we get past the threshold, have zero bits, or fail to improve.
1186  int best_index = 0; // To zero out.
      // NOTE(review): `blob == nullptr || ... || blob != nullptr` is always
      // true, so the best_cert < target_cert test has no effect on this loop
      // condition - confirm intent.
1187  while (num_outlines > 1 && best_index >= 0 &&
1188  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1189  // Find the best bit to zero out.
1190  best_index = -1;
1191  for (int i = 0; i < outlines.size(); ++i) {
1192  if (test_outlines[i]) {
      // Trial: classify with outline i removed; the flag is restored below.
1193  test_outlines[i] = false;
1194  STRING str;
1195  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1196  pr_it, blob, &str);
1197  if (debug_noise_removal) {
1198  TBOX ol_box;
1199  for (int j = 0; j < outlines.size(); ++j) {
1200  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1201  tprintf("%d", test_outlines[j]);
1202  }
1203  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1204  cert, cert - target_cert);
1205  ol_box.print();
1206  }
1207  if (cert > best_cert) {
1208  best_cert = cert;
1209  best_index = i;
1210  best_outlines = test_outlines;
1211  }
1212  test_outlines[i] = true;
1213  }
1214  }
      // Commit the best removal of this round, if any improved the certainty.
1215  if (best_index >= 0) {
1216  test_outlines[best_index] = false;
1217  --num_outlines;
1218  }
1219  }
1220  if (best_cert >= target_cert) {
1221  // Save the best combination.
1222  *ok_outlines = best_outlines;
1223  if (debug_noise_removal) {
1224  tprintf("%s noise combination ", blob ? "Adding" : "New");
1225  for (int i = 0; i < best_outlines.size(); ++i) {
1226  tprintf("%d", best_outlines[i]);
1227  }
1228  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1229  target_cert);
1230  }
1231  return true;
1232  }
1233 #endif // ndef DISABLED_LEGACY_ENGINE
1234  return false;
1235 }
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1287
int size() const
Definition: genericvector.h:71
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1239
void print() const
Definition: rect.h:278
const char * string() const
Definition: strngs.cpp:196
Definition: rect.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
TBOX bounding_box() const
Definition: stepblob.cpp:255

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES word,
int16_t  pass 
)

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 207 of file tesseractclass.h.

207  {  // Replaces the stored grey image pointer with grey_pix.
208  pixDestroy(&pix_grey_);  // Drop the previously held image before overwriting the pointer.
209  pix_grey_ = grey_pix;  // NOTE(review): presumably takes over the caller's reference - confirm.
210  }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Pix *  original_pix)
inline

Definition at line 213 of file tesseractclass.h.

213  {  // Replaces the stored original image here and in every sub-language.
214  pixDestroy(&pix_original_);  // Drop the previously held image first.
215  pix_original_ = original_pix;
216  // Give each sub-language its own pixClone of the image; a null image is passed on as nullptr.
217  for (int i = 0; i < sub_langs_.size(); ++i)
218  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
219  : nullptr);
220  }
void set_pix_original(Pix *original_pix)

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 237 of file tesseractclass.h.

237  {  // Replaces the stored thresholds image pointer with thresholds.
238  pixDestroy(&pix_thresholds_);  // Drop the previously held image before overwriting the pointer.
239  pix_thresholds_ = thresholds;  // NOTE(review): presumably takes over the caller's reference - confirm.
240  }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 244 of file tesseractclass.h.

244  {  // Stores ppi as the source resolution; no validation is done here.
245  source_resolution_ = ppi;
246  }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES word)

Definition at line 280 of file output.cpp.

280  {
281  int len = word_res->reject_map.length();
282  const WERD_CHOICE &word = *(word_res->best_choice);
283  const UNICHARSET &uchset = *word.unicharset();
284  int i;
285  float rating_per_ch;
286 
287  if (suspect_level == 0) {
288  for (i = 0; i < len; i++) {
289  if (word_res->reject_map[i].rejected())
290  word_res->reject_map[i].setrej_minimal_rej_accept();
291  }
292  return;
293  }
294 
295  if (suspect_level >= 3)
296  return; //Use defaults
297 
298  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
299 
300  if (safe_dict_word(word_res) &&
301  (count_alphas(word) > suspect_short_words)) {
302  /* Unreject alphas in dictionary words */
303  for (i = 0; i < len; ++i) {
304  if (word_res->reject_map[i].rejected() &&
305  uchset.get_isalpha(word.unichar_id(i)))
306  word_res->reject_map[i].setrej_minimal_rej_accept();
307  }
308  }
309 
310  rating_per_ch = word.rating() / word_res->reject_map.length();
311 
312  if (rating_per_ch >= suspect_rating_per_ch)
313  return; // Don't touch bad ratings
314 
315  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
316  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
317  for (i = 0; i < len; ++i) {
318  if (word_res->reject_map[i].rejected() &&
319  (!uchset.eq(word.unichar_id(i), " ")))
320  word_res->reject_map[i].setrej_minimal_rej_accept();
321  }
322  }
323 
324  for (i = 0; i < len; i++) {
325  if (word_res->reject_map[i].rejected()) {
326  if (word_res->reject_map[i].flag(R_DOC_REJ))
327  word_res->reject_map[i].setrej_minimal_rej_accept();
328  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
329  word_res->reject_map[i].setrej_minimal_rej_accept();
330  if (word_res->reject_map[i].flag(R_ROW_REJ))
331  word_res->reject_map[i].setrej_minimal_rej_accept();
332  }
333  }
334 
335  if (suspect_level == 2)
336  return;
337 
338  if (!suspect_constrain_1Il ||
339  (word_res->reject_map.length() <= suspect_short_words)) {
340  for (i = 0; i < len; i++) {
341  if (word_res->reject_map[i].rejected()) {
342  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
343  word_res->reject_map[i].flag(R_POSTNN_1IL)))
344  word_res->reject_map[i].setrej_minimal_rej_accept();
345 
346  if (!suspect_constrain_1Il &&
347  word_res->reject_map[i].flag(R_MM_REJECT))
348  word_res->reject_map[i].setrej_minimal_rej_accept();
349  }
350  }
351  }
352 
353  if (acceptable_word_string(*word_res->uch_set,
354  word.unichar_string().string(),
355  word.unichar_lengths().string()) !=
356  AC_UNACCEPTABLE ||
358  word.unichar_lengths().string())) {
359  if (word_res->reject_map.length() > suspect_short_words) {
360  for (i = 0; i < len; i++) {
361  if (word_res->reject_map[i].rejected() &&
362  (!word_res->reject_map[i].perm_rejected() ||
363  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
364  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
365  word_res->reject_map[i].flag (R_MM_REJECT))) {
366  word_res->reject_map[i].setrej_minimal_rej_accept();
367  }
368  }
369  }
370  }
371 }
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:394
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
const char * string() const
Definition: strngs.cpp:196
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:373
float rating() const
Definition: ratngs.h:327
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
const STRING & unichar_string() const
Definition: ratngs.h:541
Unacceptable word.
Definition: control.h:30

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES word)

set_word_fonts

Get the fonts for the word.

Definition at line 1981 of file control.cpp.

1981  {  // Sets word's fontinfo/fontinfo2, their vote counts, and the italic/bold scores from per-blob font scores.
1982  // Don't try to set the word fonts for an lstm word, as the configs
1983  // will be meaningless.
1984  if (word->chopped_word == nullptr) return;
1985  ASSERT_HOST(word->best_choice != nullptr);
1986 
1987 #ifndef DISABLED_LEGACY_ENGINE
1988  const int fontinfo_size = get_fontinfo_table().size();
1989  if (fontinfo_size == 0) return;  // No fonts known: leave the word untouched.
1990  GenericVector<int> font_total_score;
1991  font_total_score.init_to_size(fontinfo_size, 0);  // One accumulator per font id, starting at 0.
1992 
1993  word->italic = 0;
1994  word->bold = 0;
1995  // Compute the font scores for the word
1996  if (tessedit_debug_fonts) {
1997  tprintf("Examining fonts in %s\n",
1998  word->best_choice->debug_string().string());
1999  }
2000  for (int b = 0; b < word->best_choice->length(); ++b) {
2001  const BLOB_CHOICE* choice = word->GetBlobChoice(b);
2002  if (choice == nullptr) continue;
2003  const GenericVector<ScoredFont>& fonts = choice->fonts();
2004  for (int f = 0; f < fonts.size(); ++f) {
2005  const int fontinfo_id = fonts[f].fontinfo_id;
2006  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {  // Ignore ids outside the table.
2007  font_total_score[fontinfo_id] += fonts[f].score;
2008  }
2009  }
2010  }
2011  // Find the top and 2nd choice for the word.
2012  int score1 = 0, score2 = 0;
2013  int16_t font_id1 = -1, font_id2 = -1;  // -1 means no font with a positive total score was found.
2014  for (int f = 0; f < fontinfo_size; ++f) {
2015  if (tessedit_debug_fonts && font_total_score[f] > 0) {
2016  tprintf("Font %s, total score = %d\n",
2017  fontinfo_table_.get(f).name, font_total_score[f]);
2018  }
2019  if (font_total_score[f] > score1) {  // New best: the old best becomes the runner-up.
2020  score2 = score1;
2021  font_id2 = font_id1;
2022  score1 = font_total_score[f];
2023  font_id1 = f;
2024  } else if (font_total_score[f] > score2) {
2025  score2 = font_total_score[f];
2026  font_id2 = f;
2027  }
2028  }
2029  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2030  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2031  // Each score has a limit of UINT16_MAX, so divide by that to get the number
2032  // of "votes" for that font, ie number of perfect scores.
2033  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);  // Lower bound is 1 even when score1 is 0.
2034  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2035  if (score1 > 0) {
2036  const FontInfo fi = fontinfo_table_.get(font_id1);
2037  if (tessedit_debug_fonts) {
2038  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2039  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2040  fi.name, word->fontinfo_id_count,
2041  fontinfo_table_.get(font_id2).name,
2042  word->fontinfo_id2_count);
2043  } else {
2044  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2045  fi.name, word->fontinfo_id_count);
2046  }
2047  }
2048  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;  // Sign encodes italic or not; magnitude is the vote count.
2049  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;  // Same encoding for bold.
2050  }
2051 #endif // ndef DISABLED_LEGACY_ENGINE
2052 }
int size() const
Definition: genericvector.h:71
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
int8_t italic
Definition: pageres.h:301
const char * string() const
Definition: strngs.cpp:196
const FontInfo * fontinfo
Definition: pageres.h:304
bool is_bold() const
Definition: fontinfo.h:112
int8_t fontinfo_id2_count
Definition: pageres.h:307
void init_to_size(int size, const T &t)
bool is_italic() const
Definition: fontinfo.h:111
int8_t bold
Definition: pageres.h:302
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:92
int8_t fontinfo_id_count
Definition: pageres.h:306
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
const STRING debug_string() const
Definition: ratngs.h:505
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:235
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:756
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo2
Definition: pageres.h:305

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 609 of file tesseractclass.cpp.

609  {  // Applies the blacklist/whitelist/unblacklist settings to this unicharset and to every sub-language's.
610  // Set the white and blacklists (if any)
611  unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
612  tessedit_char_whitelist.string(),
613  tessedit_char_unblacklist.string());
614  // Black and white lists should apply to all loaded classifiers.
615  for (int i = 0; i < sub_langs_.size(); ++i) {
616  sub_langs_[i]->unicharset.set_black_and_whitelist(
617  tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
618  tessedit_char_unblacklist.string());
619  }
620 }
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:979
UNICHARSET unicharset
Definition: ccutil.h:68

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect detector)

Definition at line 586 of file tesseractclass.cpp.

586  {  // Stores detector as the equation detector and, if it is non-null, points it back at this Tesseract.
587  equ_detect_ = detector;
588  if (equ_detect_ != nullptr) equ_detect_->SetLangTesseract(this);  // A null detector just clears the member instead of crashing.
589 }
void SetLangTesseract(Tesseract *lang_tesseract)

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 259 of file tesseractclass.h.

259  {  // Stores the scale factor and the scaled colour image pointer.
260  scaled_factor_ = factor;
261  scaled_color_ = color;  // NOTE(review): the previous scaled_color_ is not pixDestroy'd here, unlike set_pix_grey - confirm ownership.
262  }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX target_word_box,
const char *  word_config,
PAGE_RES page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 159 of file control.cpp.

163  {  // Collects the page's words (optionally filtered by target_word_box) into *words and sets each up for pass_n.
164  // Prepare all the words.
165  PAGE_RES_IT page_res_it(page_res);
166  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
167  page_res_it.forward()) {
168  if (target_word_box == nullptr ||
169  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
170  *target_word_box, word_config, 1)) {  // NOTE(review): pass is the literal 1 here, not pass_n - confirm intended.
171  words->push_back(WordData(page_res_it));
172  }
173  }
174  // Setup all the words for recognition with polygonal approximation.
175  for (int w = 0; w < words->size(); ++w) {
176  SetupWordPassN(pass_n, &(*words)[w]);
177  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];  // Linked only after all push_backs, so the element addresses no longer change.
178  }
179 }
int size() const
Definition: genericvector.h:71
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:125
int push_back(T object)
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182

◆ SetupApplyBoxes()

PAGE_RES* tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 271 of file pagesegmain.cpp.

274  {
275  int vertical_x = 0;
276  int vertical_y = 1;
277  TabVector_LIST v_lines;
278  TabVector_LIST h_lines;
279  ICOORD bleft(0, 0);
280 
281  ASSERT_HOST(pix_binary_ != nullptr);
283  pixa_debug_.AddPix(pix_binary_, "PageSegInput");
284  }
285  // Leptonica is used to find the rule/separator lines in the input.
286  LineFinder::FindAndRemoveLines(source_resolution_,
287  textord_tabfind_show_vlines, pix_binary_,
288  &vertical_x, &vertical_y, music_mask_pix,
289  &v_lines, &h_lines);
291  pixa_debug_.AddPix(pix_binary_, "NoLines");
292  }
293  // Leptonica is used to find a mask of the photo regions in the input.
294  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
296  pixa_debug_.AddPix(pix_binary_, "NoImages");
297  }
298  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
299 
300  // The rest of the algorithm uses the usual connected components.
301  textord_.find_components(pix_binary_, blocks, to_blocks);
302 
303  TO_BLOCK_IT to_block_it(to_blocks);
304  // There must be exactly one input block.
305  // TODO(rays) handle new textline finding with a UNLV zone file.
306  ASSERT_HOST(to_blocks->singleton());
307  TO_BLOCK* to_block = to_block_it.data();
308  TBOX blkbox = to_block->block->pdblk.bounding_box();
309  ColumnFinder* finder = nullptr;
310  int estimated_resolution = source_resolution_;
311  if (source_resolution_ == kMinCredibleResolution) {
312  // Try to estimate resolution from typical body text size.
313  int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
314  if (res > estimated_resolution && res < kMaxCredibleResolution) {
315  estimated_resolution = res;
316  tprintf("Estimating resolution as %d\n", estimated_resolution);
317  }
318  }
319 
320  if (to_block->line_size >= 2) {
321  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
322  blkbox.botleft(), blkbox.topright(),
323  estimated_resolution, textord_use_cjk_fp_model,
325  &h_lines, vertical_x, vertical_y);
326 
327  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
328 
329 #ifndef DISABLED_LEGACY_ENGINE
330 
331  if (equ_detect_) {
332  equ_detect_->LabelSpecialText(to_block);
333  }
334 
335  BLOBNBOX_CLIST osd_blobs;
336  // osd_orientation is the number of 90 degree rotations to make the
337  // characters upright. (See osdetect.h for precise definition.)
338  // We want the text lines horizontal, (vertical text indicates vertical
339  // textlines) which may conflict (eg vertically written CJK).
340  int osd_orientation = 0;
341  bool vertical_text = textord_tabfind_force_vertical_text ||
342  pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
343  if (!vertical_text && textord_tabfind_vertical_text &&
344  PSM_ORIENTATION_ENABLED(pageseg_mode)) {
345  vertical_text =
346  finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
347  to_block, &osd_blobs);
348  }
349  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
350  GenericVector<int> osd_scripts;
351  if (osd_tess != this) {
352  // We are running osd as part of layout analysis, so constrain the
353  // scripts to those allowed by *this.
354  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
355  for (int s = 0; s < sub_langs_.size(); ++s) {
356  AddAllScriptsConverted(sub_langs_[s]->unicharset,
357  osd_tess->unicharset, &osd_scripts);
358  }
359  }
360  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
361  if (pageseg_mode == PSM_OSD_ONLY) {
362  delete finder;
363  return nullptr;
364  }
365  osd_orientation = osr->best_result.orientation_id;
366  double osd_score = osr->orientations[osd_orientation];
367  double osd_margin = min_orientation_margin * 2;
368  for (int i = 0; i < 4; ++i) {
369  if (i != osd_orientation &&
370  osd_score - osr->orientations[i] < osd_margin) {
371  osd_margin = osd_score - osr->orientations[i];
372  }
373  }
374  int best_script_id = osr->best_result.script_id;
375  const char* best_script_str =
376  osd_tess->unicharset.get_script_from_script_id(best_script_id);
377  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
378  best_script_id == osd_tess->unicharset.hiragana_sid() ||
379  best_script_id == osd_tess->unicharset.katakana_sid() ||
380  strcmp("Japanese", best_script_str) == 0 ||
381  strcmp("Korean", best_script_str) == 0 ||
382  strcmp("Hangul", best_script_str) == 0;
383  if (cjk) {
384  finder->set_cjk_script(true);
385  }
386  if (osd_margin < min_orientation_margin) {
387  // The margin is weak.
388  if (!cjk && !vertical_text && osd_orientation == 2) {
389  // upside down latin text is improbable with such a weak margin.
390  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
391  "Don't rotate.\n", osd_margin);
392  osd_orientation = 0;
393  } else {
394  tprintf(
395  "OSD: Weak margin (%.2f) for %d blob text block, "
396  "but using orientation anyway: %d\n",
397  osd_margin, osd_blobs.length(), osd_orientation);
398  }
399  }
400  }
401  osd_blobs.shallow_clear();
402  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
403 
404 #endif // ndef DISABLED_LEGACY_ENGINE
405  }
406 
407  return finder;
408 }
const ICOORD & topright() const
Definition: rect.h:104
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:45
int LabelSpecialText(TO_BLOCK *to_block)
Definition: rect.h:34
int script_id
Definition: osdetect.h:44
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:219
float orientations[4]
Definition: osdetect.h:76
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:241
OSBestResult best_result
Definition: osdetect.h:81
int orientation_id
Definition: osdetect.h:43
bool textord_tabfind_force_vertical_text
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:197
double textord_tabfind_vertical_text_ratio
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:194
UNICHARSET unicharset
Definition: ccutil.h:68
integer coordinate
Definition: points.h:32
int IntCastRounded(double x)
Definition: helpers.h:168
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const ICOORD & botleft() const
Definition: rect.h:92
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:63
Orientation and script detection only.
Definition: publictypes.h:164
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
double textord_tabfind_aligned_gap_fraction
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:278
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 441 of file tessedit.cpp.

441  {  // Builds one font table across this and all sub-languages, assigns ids from it, and records its size.
442  // Note that we can get away with bitwise copying FontInfo in
443  // all_fonts, as it is a temporary structure and we avoid setting the
444  // delete callback.
445  UnicityTable<FontInfo> all_fonts;
446  all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
447 
448  // Create the universal ID table.
449  CollectFonts(get_fontinfo_table(), &all_fonts);
450  for (int i = 0; i < sub_langs_.size(); ++i) {
451  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
452  }
453  // Assign ids from the table to each font table.
454  AssignIds(all_fonts, &get_fontinfo_table());
455  for (int i = 0; i < sub_langs_.size(); ++i) {
456  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
457  }
458  font_table_size_ = all_fonts.size();
459 }
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
int size() const
Return the size used.

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData word 
)

Definition at line 182 of file control.cpp.

182  {  // Prepares word->word and one fresh WERD_RES per language (sub-languages, then the master) for pass pass_n.
183  if (pass_n == 1 || !word->word->done) {  // On later passes, words already marked done are left alone.
184  if (pass_n == 1) {
185  word->word->SetupForRecognition(unicharset, this, BestPix(),
186  tessedit_ocr_engine_mode, nullptr,
187  classify_bln_numeric_mode,
188  textord_use_cjk_fp_model,
189  poly_allow_detailed_fx,
190  word->row, word->block);
191  } else if (pass_n == 2) {
192  // TODO(rays) Should we do this on pass1 too?
193  word->word->caps_height = 0.0;
194  if (word->word->x_height == 0.0f)
195  word->word->x_height = word->row->x_height();
196  }
197  word->lang_words.truncate(0);
198  for (int s = 0; s <= sub_langs_.size(); ++s) {
199  // The sub_langs_.size() entry is for the master language.
200  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
201  WERD_RES* word_res = new WERD_RES;
202  word_res->InitForRetryRecognition(*word->word);
203  word->lang_words.push_back(word_res);  // The raw pointer is stored in lang_words.
204  // LSTM doesn't get setup for pass2.
205  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
206  word_res->SetupForRecognition(
207  lang_t->unicharset, lang_t, BestPix(),
208  lang_t->tessedit_ocr_engine_mode, nullptr,
209  lang_t->classify_bln_numeric_mode,
210  lang_t->textord_use_cjk_fp_model,
211  lang_t->poly_allow_detailed_fx, word->row, word->block);
212  }
213  }
214  }
215 }
bool classify_bln_numeric_mode
Definition: classify.h:541
Pix * BestPix() const
UNICHARSET unicharset
Definition: ccutil.h:68
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:283

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 241 of file tesseractclass.h.

241  {  // Returns the value last stored in source_resolution_.
242  return source_resolution_;
243  }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES word)

Definition at line 138 of file tfacepp.cpp.

138  {  // Splits word at its widest inter-blob gap, recognizes both halves, then joins them back into word.
139  // Find the biggest blob gap in the chopped_word.
140  int bestgap = -INT32_MAX;
141  int split_index = 0;
142  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
143  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
144  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
145  int gap = blob_box.left() - prev_box.right();  // Negative when the two boxes overlap horizontally.
146  if (gap > bestgap) {  // Strict comparison: the first of several equal gaps wins.
147  bestgap = gap;
148  split_index = b;
149  }
150  }
151  ASSERT_HOST(split_index > 0);  // Fails when the word has fewer than two blobs, since the loop never runs.
152 
153  WERD_RES *word2 = nullptr;
154  BlamerBundle *orig_bb = nullptr;
155  split_word(word, split_index, &word2, &orig_bb);
156 
157  // Recognize the first part of the word.
158  recog_word_recursive(word);
159  // Recognize the second part of the word.
160  recog_word_recursive(word2);
161 
162  join_words(word, word2, orig_bb);
163 }
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
int16_t left() const
Definition: rect.h:72
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
int16_t right() const
Definition: rect.h:79
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
TWERD * chopped_word
Definition: pageres.h:215
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 176 of file tfacepp.cpp.

179  {  // Moves blobs [split_pt, end) of word into a new WERD_RES returned via *right_piece; word keeps [0, split_pt).
180  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
181 
182  // Save a copy of the blamer bundle so we can try to reconstruct it below.
183  BlamerBundle *orig_bb =
184  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
185 
186  WERD_RES *word2 = new WERD_RES(*word);
187 
188  // blow away the copied chopped_word, as we want to work with
189  // the blobs from the input chopped_word so seam_arrays can be merged.
190  TWERD *chopped = word->chopped_word;
191  TWERD *chopped2 = new TWERD;
192  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
193  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
194  chopped2->blobs.push_back(chopped->blobs[i]);
195  }
196  chopped->blobs.truncate(split_pt);
197  word->chopped_word = nullptr;  // Detached before ClearResults(); reattached as `chopped` below.
198  delete word2->chopped_word;
199  word2->chopped_word = nullptr;
200 
201  const UNICHARSET &unicharset = *word->uch_set;  // Captured before ClearResults() is called on word.
202  word->ClearResults();
203  word2->ClearResults();
204  word->chopped_word = chopped;
205  word2->chopped_word = chopped2;
206  word->SetupBasicsFromChoppedWord(unicharset);
207  word2->SetupBasicsFromChoppedWord(unicharset);
208 
209  // Try to adjust the blamer bundle.
210  if (orig_bb != nullptr) {
211  // TODO(rays) Looks like a leak to me.
212  // orig_bb should take, rather than copy.
213  word->blamer_bundle = new BlamerBundle();
214  word2->blamer_bundle = new BlamerBundle();
215  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
216  word2->chopped_word->blobs[0]->bounding_box().left(),
217  wordrec_debug_blamer,
218  word->blamer_bundle, word2->blamer_bundle);
219  }
220 
221  *right_piece = word2;
222  *orig_blamer_bundle = orig_bb;  // May be nullptr when word had no blamer bundle.
223 }
Definition: blobs.h:402
int NumBlobs() const
Definition: blobs.h:432
void reserve(int size)
T & back() const
UNICHARSET unicharset
Definition: ccutil.h:68
TBOX bounding_box() const
Definition: blobs.cpp:478
int push_back(T object)
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
void ClearResults()
Definition: pageres.cpp:1153
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:349
const UNICHARSET * uch_set
Definition: pageres.h:206
BlamerBundle * blamer_bundle
Definition: pageres.h:246
int16_t right() const
Definition: rect.h:79
void truncate(int size)
bool wordrec_debug_blamer
Definition: wordrec.h:236
TWERD * chopped_word
Definition: pageres.h:215
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 102 of file superscript.cpp.

102  {
103  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
104  !word->best_choice) {
105  return false;
106  }
107  int num_leading, num_trailing;
108  ScriptPos sp_leading, sp_trailing;
109  float leading_certainty, trailing_certainty;
110  float avg_certainty, unlikely_threshold;
111 
112  // Calculate the number of whole suspicious characters at the edges.
114  word, &num_leading, &sp_leading, &leading_certainty,
115  &num_trailing, &sp_trailing, &trailing_certainty,
116  &avg_certainty, &unlikely_threshold);
117 
118  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
119  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
120 
121  int num_blobs = word->best_choice->length();
122 
123  // Calculate the remainder (partial characters) at the edges.
124  // This accounts for us having classified the best version of
125  // a word as [speaker?'] when it was instead [speaker.^{21}]
126  // (that is we accidentally thought the 2 was attached to the period).
127  int num_remainder_leading = 0, num_remainder_trailing = 0;
128  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
129  int super_y_bottom =
131  int sub_y_top =
133  int last_word_char = num_blobs - 1 - num_trailing;
134  float last_char_certainty = word->best_choice->certainty(last_word_char);
135  if (word->best_choice->unichar_id(last_word_char) != 0 &&
136  last_char_certainty <= unlikely_threshold) {
137  ScriptPos rpos;
138  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
139  nullptr, nullptr, &rpos, &num_remainder_trailing);
140  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
141  if (num_remainder_trailing > 0 &&
142  last_char_certainty < trailing_certainty) {
143  trailing_certainty = last_char_certainty;
144  }
145  }
146  bool another_blob_available = (num_remainder_trailing == 0) ||
147  num_leading + num_trailing + 1 < num_blobs;
148  int first_char_certainty = word->best_choice->certainty(num_leading);
149  if (another_blob_available &&
150  word->best_choice->unichar_id(num_leading) != 0 &&
151  first_char_certainty <= unlikely_threshold) {
152  ScriptPos lpos;
153  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
154  &lpos, &num_remainder_leading, nullptr, nullptr);
155  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
156  if (num_remainder_leading > 0 &&
157  first_char_certainty < leading_certainty) {
158  leading_certainty = first_char_certainty;
159  }
160  }
161  }
162 
163  // If nothing to do, bail now.
164  if (num_leading + num_trailing +
165  num_remainder_leading + num_remainder_trailing == 0) {
166  return false;
167  }
168 
169  if (superscript_debug >= 1) {
170  tprintf("Candidate for superscript detection: %s (",
171  word->best_choice->unichar_string().string());
172  if (num_leading || num_remainder_leading) {
173  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
174  leading_pos);
175  }
176  if (num_trailing || num_remainder_trailing) {
177  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
178  trailing_pos);
179  }
180  tprintf(")\n");
181  }
182  if (superscript_debug >= 3) {
183  word->best_choice->print();
184  }
185  if (superscript_debug >= 2) {
186  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
187  avg_certainty, unlikely_threshold);
188  if (num_leading)
189  tprintf("Orig. leading (min): %.2f ", leading_certainty);
190  if (num_trailing)
191  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
192  tprintf("\n");
193  }
194 
195  // We've now calculated the number of rebuilt blobs we want to carve off.
196  // However, split_word() works from TBLOBs in chopped_word, so we need to
197  // convert to those.
198  int num_chopped_leading =
199  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
200  int num_chopped_trailing =
201  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202 
203  int retry_leading = 0;
204  int retry_trailing = 0;
205  bool is_good = false;
206  WERD_RES *revised = TrySuperscriptSplits(
207  num_chopped_leading, leading_certainty, sp_leading,
208  num_chopped_trailing, trailing_certainty, sp_trailing,
209  word, &is_good, &retry_leading, &retry_trailing);
210  if (is_good) {
211  word->ConsumeWordResults(revised);
212  } else if (retry_leading || retry_trailing) {
213  int retry_chopped_leading =
214  LeadingUnicharsToChopped(revised, retry_leading);
215  int retry_chopped_trailing =
216  TrailingUnicharsToChopped(revised, retry_trailing);
217  WERD_RES *revised2 = TrySuperscriptSplits(
218  retry_chopped_leading, leading_certainty, sp_leading,
219  retry_chopped_trailing, trailing_certainty, sp_trailing,
220  revised, &is_good, &retry_leading, &retry_trailing);
221  if (is_good) {
222  word->ConsumeWordResults(revised2);
223  }
224  delete revised2;
225  }
226  delete revised;
227  return is_good;
228 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:771
bool tess_failed
Definition: pageres.h:288
const char * string() const
Definition: strngs.cpp:196
void print() const
Definition: ratngs.h:580
const int kBlnXHeight
Definition: normalis.h:24
float certainty() const
Definition: ratngs.h:330
const int kBlnBaselineOffset
Definition: normalis.h:25
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
const STRING & unichar_string() const
Definition: ratngs.h:541
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ terrible_word_crunch()

bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES * word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 504 of file docqual.cpp.

505  {
506  float rating_per_ch;
507  int adjusted_len;
508  int crunch_mode = 0;
509 
510  if ((word->best_choice->unichar_string().length() == 0) ||
511  (strspn(word->best_choice->unichar_string().string(), " ") ==
513  crunch_mode = 1;
514  else {
515  adjusted_len = word->reject_map.length ();
516  if (adjusted_len > crunch_rating_max)
517  adjusted_len = crunch_rating_max;
518  rating_per_ch = word->best_choice->rating () / adjusted_len;
519 
520  if (rating_per_ch > crunch_terrible_rating)
521  crunch_mode = 2;
522  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
523  crunch_mode = 3;
524  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
525  (garbage_level != G_OK))
526  crunch_mode = 4;
527  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
528  (garbage_level != G_OK))
529  crunch_mode = 5;
530  }
531  if (crunch_mode > 0) {
532  if (crunch_debug > 2) {
533  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
534  crunch_mode, word->best_choice->unichar_string().string());
535  }
536  return true;
537  }
538  else
539  return false;
540 }
uint32_t unsigned_size() const
Definition: strngs.h:71
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t length() const
Definition: rejctmap.h:223
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
Definition: docqual.h:32
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const STRING & unichar_string() const
Definition: ratngs.h:541
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES * word)

Definition at line 62 of file tessbox.cpp.

62  {
63  return getDict().AcceptableResult(word);
64 }
Dict & getDict() override
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:101

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE * word_choice)

Definition at line 72 of file tessbox.cpp.

72  {
73  getDict().add_document_word(*word_choice);
74 }
Dict & getDict() override
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:613

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES * word 
)

Definition at line 32 of file tessbox.cpp.

32  {
33  int saved_enable_assoc = 0;
34  int saved_chop_enable = 0;
35 
36  if (word->word->flag(W_DONT_CHOP)) {
37  saved_enable_assoc = wordrec_enable_assoc;
38  saved_chop_enable = chop_enable;
39  wordrec_enable_assoc.set_value(0);
40  chop_enable.set_value(0);
41  }
42  if (pass_n == 1)
43  set_pass1();
44  else
45  set_pass2();
46  recog_word(word);
47  if (word->best_choice == nullptr)
48  word->SetupFake(*word->uch_set);
49  if (word->word->flag(W_DONT_CHOP)) {
50  wordrec_enable_assoc.set_value(saved_enable_assoc);
51  chop_enable.set_value(saved_chop_enable);
52  }
53 }
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:40
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void set_pass2()
Definition: tface.cpp:101
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:358
void set_pass1()
Definition: tface.cpp:89
const UNICHARSET * uch_set
Definition: pageres.h:206
bool wordrec_enable_assoc
Definition: wordrec.h:199
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES * word,
BLOCK * block,
ROW * row 
)

Definition at line 1538 of file control.cpp.

1540  {
1541  bool accept_new_x_ht = false;
1542  WERD_RES new_x_ht_word(word->word);
1543  if (word->blamer_bundle != nullptr) {
1544  new_x_ht_word.blamer_bundle = new BlamerBundle();
1545  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1546  }
1547  new_x_ht_word.x_height = new_x_ht;
1548  new_x_ht_word.baseline_shift = baseline_shift;
1549  new_x_ht_word.caps_height = 0.0;
1550  new_x_ht_word.SetupForRecognition(
1551  unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1553  poly_allow_detailed_fx, row, block);
1554  match_word_pass_n(2, &new_x_ht_word, row, block);
1555  if (!new_x_ht_word.tess_failed) {
1556  int new_misfits = CountMisfitTops(&new_x_ht_word);
1557  if (debug_x_ht_level >= 1) {
1558  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1559  original_misfits, word->x_height,
1560  new_misfits, new_x_ht);
1561  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1562  word->best_choice->rating(), word->best_choice->certainty(),
1563  new_x_ht_word.best_choice->rating(),
1564  new_x_ht_word.best_choice->certainty());
1565  }
1566  // The misfits must improve and either the rating or certainty.
1567  accept_new_x_ht = new_misfits < original_misfits &&
1568  (new_x_ht_word.best_choice->certainty() >
1569  word->best_choice->certainty() ||
1570  new_x_ht_word.best_choice->rating() <
1571  word->best_choice->rating());
1572  if (debug_x_ht_level >= 1) {
1573  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1574  }
1575  }
1576  if (accept_new_x_ht) {
1577  word->ConsumeWordResults(&new_x_ht_word);
1578  return true;
1579  }
1580  return false;
1581 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:771
bool classify_bln_numeric_mode
Definition: classify.h:541
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
Pix * BestPix() const
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1649
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1481
UNICHARSET unicharset
Definition: ccutil.h:68
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
float x_height
Definition: pageres.h:311
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
BlamerBundle * blamer_bundle
Definition: pageres.h:246
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ textord()

const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 263 of file tesseractclass.h.

263  {
264  return textord_;
265  }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 418 of file docqual.cpp.

418  {
419  WERD_RES *word;
420  GARBAGE_LEVEL garbage_level;
421  PAGE_RES_IT copy_it;
422  bool prev_potential_marked = false;
423  bool found_terrible_word = false;
424  BOOL8 ok_dict_word;
425 
426  page_res_it.restart_page();
427  while (page_res_it.word() != nullptr) {
428  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
429  if (pb != nullptr && !pb->IsText()) {
430  page_res_it.forward();
431  continue;
432  }
433  word = page_res_it.word();
434 
436  convert_bad_unlv_chs(word);
437 
439  word->merge_tess_fails();
440 
441  if (word->reject_map.accept_count () != 0) {
442  found_terrible_word = false;
443  //Forget earlier potential crunches
444  prev_potential_marked = false;
445  }
446  else {
447  ok_dict_word = safe_dict_word(word);
448  garbage_level = garbage_word(word, ok_dict_word);
449 
450  if ((garbage_level != G_NEVER_CRUNCH) &&
451  (terrible_word_crunch (word, garbage_level))) {
452  if (crunch_debug > 0) {
453  tprintf ("T CRUNCHING: \"%s\"\n",
454  word->best_choice->unichar_string().string());
455  }
457  if (prev_potential_marked) {
458  while (copy_it.word () != word) {
459  if (crunch_debug > 0) {
460  tprintf ("P1 CRUNCHING: \"%s\"\n",
461  copy_it.word()->best_choice->unichar_string().string());
462  }
463  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
464  copy_it.forward ();
465  }
466  prev_potential_marked = false;
467  }
468  found_terrible_word = true;
469  }
470  else if ((garbage_level != G_NEVER_CRUNCH) &&
471  (potential_word_crunch (word,
472  garbage_level, ok_dict_word))) {
473  if (found_terrible_word) {
474  if (crunch_debug > 0) {
475  tprintf ("P2 CRUNCHING: \"%s\"\n",
476  word->best_choice->unichar_string().string());
477  }
479  }
480  else if (!prev_potential_marked) {
481  copy_it = page_res_it;
482  prev_potential_marked = true;
483  if (crunch_debug > 1) {
484  tprintf ("P3 CRUNCHING: \"%s\"\n",
485  word->best_choice->unichar_string().string());
486  }
487  }
488  }
489  else {
490  found_terrible_word = false;
491  //Forget earlier potential crunches
492  prev_potential_marked = false;
493  if (crunch_debug > 2) {
494  tprintf ("NO CRUNCH: \"%s\"\n",
495  word->best_choice->unichar_string().string());
496  }
497  }
498  }
499  page_res_it.forward ();
500  }
501 }
BLOCK_RES * block() const
Definition: pageres.h:757
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:504
void merge_tess_fails()
Definition: pageres.cpp:1073
bool crunch_early_convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:660
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
int16_t accept_count()
Definition: rejctmap.cpp:281
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:542
WERD_RES * word() const
Definition: pageres.h:751
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
unsigned char BOOL8
Definition: host.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool IsText() const
Definition: polyblk.h:49
GARBAGE_LEVEL
Definition: docqual.h:29
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_RES * forward()
Definition: pageres.h:731
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:680
PDBLK pdblk
Definition: ocrblock.h:192
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 590 of file docqual.cpp.

590  {
591  WERD_RES *word;
592  PAGE_RES_IT copy_it;
593  bool deleting_from_bol = false;
594  bool marked_delete_point = false;
595  int16_t debug_delete_mode;
596  CRUNCH_MODE delete_mode;
597  int16_t x_debug_delete_mode;
598  CRUNCH_MODE x_delete_mode;
599 
600  page_res_it.restart_page();
601  while (page_res_it.word() != nullptr) {
602  word = page_res_it.word();
603 
604  delete_mode = word_deletable (word, debug_delete_mode);
605  if (delete_mode != CR_NONE) {
606  if (word->word->flag (W_BOL) || deleting_from_bol) {
607  if (crunch_debug > 0) {
608  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
609  debug_delete_mode,
610  word->best_choice->unichar_string().string());
611  }
612  word->unlv_crunch_mode = delete_mode;
613  deleting_from_bol = true;
614  } else if (word->word->flag(W_EOL)) {
615  if (marked_delete_point) {
616  while (copy_it.word() != word) {
617  x_delete_mode = word_deletable (copy_it.word (),
618  x_debug_delete_mode);
619  if (crunch_debug > 0) {
620  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
621  x_debug_delete_mode,
622  copy_it.word()->best_choice->unichar_string().string());
623  }
624  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
625  copy_it.forward ();
626  }
627  }
628  if (crunch_debug > 0) {
629  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
630  debug_delete_mode,
631  word->best_choice->unichar_string().string());
632  }
633  word->unlv_crunch_mode = delete_mode;
634  deleting_from_bol = false;
635  marked_delete_point = false;
636  }
637  else {
638  if (!marked_delete_point) {
639  copy_it = page_res_it;
640  marked_delete_point = true;
641  }
642  }
643  }
644  else {
645  deleting_from_bol = false;
646  //Forget earlier potential crunches
647  marked_delete_point = false;
648  }
649  /*
650  The following step has been left till now as the tess fails are used to
651  determine if the word is deletable.
652  */
654  word->merge_tess_fails();
655  page_res_it.forward ();
656  }
657 }
const char * string() const
Definition: strngs.cpp:196
Definition: werd.h:35
void merge_tess_fails()
Definition: pageres.cpp:1073
WERD_RES * restart_page()
Definition: pageres.h:698
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
CRUNCH_MODE
Definition: pageres.h:159
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_RES * forward()
Definition: pageres.h:731
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:895
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD * word
Definition: pageres.h:189

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES * word,
BLOCK * block,
ROW * row 
)

Definition at line 1504 of file control.cpp.

1504  {
1505  int original_misfits = CountMisfitTops(word);
1506  if (original_misfits == 0)
1507  return false;
1508  float baseline_shift = 0.0f;
1509  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1510  if (baseline_shift != 0.0f) {
1511  // Try the shift on its own first.
1512  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1513  word, block, row))
1514  return false;
1515  original_misfits = CountMisfitTops(word);
1516  if (original_misfits > 0) {
1517  float new_baseline_shift;
1518  // Now recompute the new x_height.
1519  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1520  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1521  // No test of return value here, as we are definitely making a change
1522  // to the word by shifting the baseline.
1523  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1524  word, block, row);
1525  }
1526  }
1527  return true;
1528  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1529  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1530  word, block, row);
1531  } else {
1532  return false;
1533  }
1534 }
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1538
const double kMinRefitXHeightFraction
Definition: control.cpp:56
float x_height
Definition: pageres.h:311
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
BLOCK_LIST *  block_list,
DocumentData * training_data 
)

Definition at line 74 of file linerec.cpp.

77  {
78  int box_count = boxes.size();
79  // Process all the text lines in this page, as defined by the boxes.
80  int end_box = 0;
81  // Don't let \t, which marks newlines in the box file, get into the line
82  // content, as that makes the line unusable in training.
83  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
84  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
85  // Find the textline of boxes starting at start and their bounding box.
86  TBOX line_box = boxes[start_box];
87  STRING line_str = texts[start_box];
88  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
89  ++end_box) {
90  line_box += boxes[end_box];
91  line_str += texts[end_box];
92  }
93  // Find the most overlapping block.
94  BLOCK* best_block = nullptr;
95  int best_overlap = 0;
96  BLOCK_IT b_it(block_list);
97  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
98  BLOCK* block = b_it.data();
99  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
100  continue; // Not a text block.
101  TBOX block_box = block->pdblk.bounding_box();
102  block_box.rotate(block->re_rotation());
103  if (block_box.major_overlap(line_box)) {
104  TBOX overlap_box = line_box.intersection(block_box);
105  if (overlap_box.area() > best_overlap) {
106  best_overlap = overlap_box.area();
107  best_block = block;
108  }
109  }
110  }
111  ImageData* imagedata = nullptr;
112  if (best_block == nullptr) {
113  tprintf("No block overlapping textline: %s\n", line_str.string());
114  } else {
115  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
116  *best_block);
117  }
118  if (imagedata != nullptr)
119  training_data->AddPageToDocument(imagedata);
120  // Don't let \t, which marks newlines in the box file, get into the line
121  // content, as that makes the line unusable in training.
122  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
123  }
124 }
int size() const
Definition: genericvector.h:71
void rotate(const FCOORD &vec)
Definition: rect.h:197
FCOORD re_rotation() const
Definition: ocrblock.h:136
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
const char * string() const
Definition: strngs.cpp:196
Definition: rect.h:34
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:129
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool IsText() const
Definition: polyblk.h:49
Definition: ocrblock.h:30
int32_t area() const
Definition: rect.h:122
Definition: strngs.h:45
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
PDBLK pdblk
Definition: ocrblock.h:192

◆ TrainLineRecognizer()

void tesseract::Tesseract::TrainLineRecognizer ( const STRING & input_imagename,
const STRING & output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 43 of file linerec.cpp.

45  {
46  STRING lstmf_name = output_basename + ".lstmf";
47  DocumentData images(lstmf_name);
48  if (applybox_page > 0) {
49  // Load existing document for the previous pages.
50  if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
51  tprintf("Failed to read training data from %s!\n", lstmf_name.string());
52  return;
53  }
54  }
55  GenericVector<TBOX> boxes;
57  // Get the boxes for this page, if there are any.
58  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
59  nullptr) ||
60  boxes.empty()) {
61  tprintf("Failed to read boxes from %s\n", input_imagename.string());
62  return;
63  }
64  TrainFromBoxes(boxes, texts, block_list, &images);
65  images.Shuffle();
66  if (!images.SaveDocument(lstmf_name.string(), nullptr)) {
67  tprintf("Failed to write training data to %s!\n", lstmf_name.string());
68  }
69 }
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:74
const char * string() const
Definition: strngs.cpp:196
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:52
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES * word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript).
[in] leading_certainty: the (minimum) certainty of the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging).
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript).
[in] trailing_certainty: the (minimum) certainty of the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging).
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: if non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 383 of file superscript.cpp.

389  {
390  int num_chopped = word->chopped_word->NumBlobs();
391 
392  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
393 
394  // Chop apart the word into up to three pieces.
395 
396  BlamerBundle *bb0 = nullptr;
397  BlamerBundle *bb1 = nullptr;
398  WERD_RES *prefix = nullptr;
399  WERD_RES *core = nullptr;
400  WERD_RES *suffix = nullptr;
401  if (num_chopped_leading > 0) {
402  prefix = new WERD_RES(*word);
403  split_word(prefix, num_chopped_leading, &core, &bb0);
404  } else {
405  core = new WERD_RES(*word);
406  }
407 
408  if (num_chopped_trailing > 0) {
409  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
410  split_word(core, split_pt, &suffix, &bb1);
411  }
412 
413  // Recognize the pieces in turn.
414  int saved_cp_multiplier = classify_class_pruner_multiplier;
415  int saved_im_multiplier = classify_integer_matcher_multiplier;
416  if (prefix) {
417  // Turn off Tesseract's y-position penalties for the leading superscript.
420 
421  // Adjust our expectations about the baseline for this prefix.
422  if (superscript_debug >= 3) {
423  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
424  }
425  recog_word_recursive(prefix);
426  if (superscript_debug >= 2) {
427  tprintf(" The leading bits look like %s %s\n",
428  ScriptPosToString(leading_pos),
429  prefix->best_choice->unichar_string().string());
430  }
431 
432  // Restore the normal y-position penalties.
433  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
434  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
435  }
436 
437  if (superscript_debug >= 3) {
438  tprintf(" recognizing middle %d chopped blobs\n",
439  num_chopped - num_chopped_leading - num_chopped_trailing);
440  }
441 
442  if (suffix) {
443  // Turn off Tesseract's y-position penalties for the trailing superscript.
446 
447  if (superscript_debug >= 3) {
448  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
449  }
450  recog_word_recursive(suffix);
451  if (superscript_debug >= 2) {
452  tprintf(" The trailing bits look like %s %s\n",
453  ScriptPosToString(trailing_pos),
454  suffix->best_choice->unichar_string().string());
455  }
456 
457  // Restore the normal y-position penalties.
458  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
459  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
460  }
461 
462  // Evaluate whether we think the results are believably better
463  // than what we already had.
464  bool good_prefix = !prefix || BelievableSuperscript(
465  superscript_debug >= 1, *prefix,
466  superscript_bettered_certainty * leading_certainty,
467  retry_rebuild_leading, nullptr);
468  bool good_suffix = !suffix || BelievableSuperscript(
469  superscript_debug >= 1, *suffix,
470  superscript_bettered_certainty * trailing_certainty,
471  nullptr, retry_rebuild_trailing);
472 
473  *is_good = good_prefix && good_suffix;
474  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
475  // None of it is any good. Quit now.
476  delete core;
477  delete prefix;
478  delete suffix;
479  delete bb1;
480  return nullptr;
481  }
482  recog_word_recursive(core);
483 
484  // Now paste the results together into core.
485  if (suffix) {
486  suffix->SetAllScriptPositions(trailing_pos);
487  join_words(core, suffix, bb1);
488  }
489  if (prefix) {
490  prefix->SetAllScriptPositions(leading_pos);
491  join_words(prefix, core, bb0);
492  core = prefix;
493  prefix = nullptr;
494  }
495 
496  if (superscript_debug >= 1) {
497  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
498  core->best_choice->unichar_string().string());
499  }
500  return core;
501 }
double superscript_bettered_certainty
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
const char * string() const
Definition: strngs.cpp:196
int NumBlobs() const
Definition: blobs.h:432
int classify_class_pruner_multiplier
Definition: classify.h:506
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int classify_integer_matcher_multiplier
Definition: classify.h:510
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:200
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
const STRING & unichar_string() const
Definition: ratngs.h:541
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:871
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word,
ROW * row 
)

Definition at line 116 of file docqual.cpp.

116  {
117  if (word->bln_boxes == nullptr ||
118  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
119  return;
120 
121  DocQualCallbacks cb(word);
123  *word->rebuild_word,
125 }
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:45
TWERD * rebuild_word
Definition: pageres.h:260
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 161 of file docqual.cpp.

162  {
163  WERD_RES *word;
164  ROW_RES *current_row;
165  BLOCK_RES *current_block;
166  int i;
167 
168  page_res_it.restart_page ();
169  while (page_res_it.word () != nullptr) {
170  check_debug_pt (page_res_it.word (), 100);
171  if (bland_unrej) {
172  word = page_res_it.word ();
173  for (i = 0; i < word->reject_map.length (); i++) {
174  if (word->reject_map[i].accept_if_good_quality ())
175  word->reject_map[i].setrej_quality_accept ();
176  }
177  page_res_it.forward ();
178  }
179  else if ((page_res_it.row ()->char_count > 0) &&
180  ((page_res_it.row ()->rej_count /
181  (float) page_res_it.row ()->char_count) <=
183  word = page_res_it.word ();
187  word->best_choice->unichar_string().string(),
189  != AC_UNACCEPTABLE)) {
190  unrej_good_chs(word, page_res_it.row ()->row);
191  }
192  page_res_it.forward ();
193  }
194  else {
195  /* Skip to end of dodgy row */
196  current_row = page_res_it.row ();
197  while ((page_res_it.word () != nullptr) &&
198  (page_res_it.row () == current_row))
199  page_res_it.forward ();
200  }
201  check_debug_pt (page_res_it.word (), 110);
202  }
203  page_res_it.restart_page ();
204  page_res_it.page_res->char_count = 0;
205  page_res_it.page_res->rej_count = 0;
206  current_block = nullptr;
207  current_row = nullptr;
208  while (page_res_it.word () != nullptr) {
209  if (current_block != page_res_it.block ()) {
210  current_block = page_res_it.block ();
211  current_block->char_count = 0;
212  current_block->rej_count = 0;
213  }
214  if (current_row != page_res_it.row ()) {
215  current_row = page_res_it.row ();
216  current_row->char_count = 0;
217  current_row->rej_count = 0;
218  current_row->whole_word_rej_count = 0;
219  }
220  page_res_it.rej_stat_word ();
221  page_res_it.forward ();
222  }
223 }
BLOCK_RES * block() const
Definition: pageres.h:757
int32_t rej_count
Definition: pageres.h:80
ROW_RES * row() const
Definition: pageres.h:754
int32_t whole_word_rej_count
Definition: pageres.h:146
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t char_count
Definition: pageres.h:118
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:302
int32_t length() const
Definition: rejctmap.h:223
WERD_RES * restart_page()
Definition: pageres.h:698
void rej_stat_word()
Definition: pageres.cpp:1674
const STRING & unichar_lengths() const
Definition: ratngs.h:548
WERD_RES * word() const
Definition: pageres.h:751
int32_t char_count
Definition: pageres.h:79
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
PAGE_RES * page_res
Definition: pageres.h:677
int32_t rej_count
Definition: pageres.h:119
const UNICHARSET * uch_set
Definition: pageres.h:206
const STRING & unichar_string() const
Definition: ratngs.h:541
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
WERD_RES * forward()
Definition: pageres.h:731
int32_t char_count
Definition: pageres.h:144
int32_t rej_count
Definition: pageres.h:145
Unacceptable word.
Definition: control.h:30
WERD_CHOICE * best_choice
Definition: pageres.h:235
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:116
ROW * row
Definition: pageres.h:143

◆ word_adaptable()

bool tesseract::Tesseract::word_adaptable ( WERD_RES *  word,
uint16_t  mode 
)

Definition at line 35 of file adaptions.cpp.

37  {
38  if (tessedit_adaption_debug) {
39  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
40  word->best_choice->unichar_string().string(),
41  word->best_choice->rating(), word->best_choice->certainty());
42  }
43 
44  BOOL8 status = FALSE;
45  BITS16 flags(mode);
46 
47  enum MODES
48  {
49  ADAPTABLE_WERD,
50  ACCEPTABLE_WERD,
51  CHECK_DAWGS,
52  CHECK_SPACES,
53  CHECK_ONE_ELL_CONFLICT,
54  CHECK_AMBIG_WERD
55  };
56 
57  /*
58  0: NO adaption
59  */
60  if (mode == 0) {
61  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
62  return false;
63  }
64 
65  if (flags.bit (ADAPTABLE_WERD)) {
66  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
67  if (tessedit_adaption_debug && !status) {
68  tprintf("tess_would_adapt bit is false\n");
69  }
70  }
71 
72  if (flags.bit (ACCEPTABLE_WERD)) {
73  status |= word->tess_accepted;
74  if (tessedit_adaption_debug && !status) {
75  tprintf("tess_accepted bit is false\n");
76  }
77  }
78 
79  if (!status) { // If not set then
80  return false; // ignore other checks
81  }
82 
83  if (flags.bit (CHECK_DAWGS) &&
84  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
85  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
86  (word->best_choice->permuter () != USER_DAWG_PERM) &&
87  (word->best_choice->permuter () != NUMBER_PERM)) {
88  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
89  return false;
90  }
91 
92  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, false)) {
93  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
94  return false;
95  }
96 
97  if (flags.bit (CHECK_SPACES) &&
98  (strchr(word->best_choice->unichar_string().string(), ' ') != nullptr)) {
99  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
100  return false;
101  }
102 
103  if (flags.bit (CHECK_AMBIG_WERD) &&
104  word->best_choice->dangerous_ambig_found()) {
105  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
106  return false;
107  }
108 
109  if (tessedit_adaption_debug) {
110  tprintf("returning status %d\n", status);
111  }
112  return status;
113 }
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
bool dangerous_ambig_found() const
Definition: ratngs.h:363
bool tess_would_adapt
Definition: pageres.h:297
#define FALSE
Definition: capi.h:52
bool tess_accepted
Definition: pageres.h:296
unsigned char BOOL8
Definition: host.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const STRING & unichar_string() const
Definition: ratngs.h:541
Definition: bits16.h:25
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:297
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ word_blank_and_set_display()

bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT *  pr_its)

Definition at line 715 of file pgedit.cpp.

715  {
716  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
718  return word_set_display(pr_it);
719 }
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:944

◆ word_bln_display()

bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT *  pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 727 of file pgedit.cpp.

727  {
728  WERD_RES* word_res = pr_it->word();
729  if (word_res->chopped_word == nullptr) {
730  // Setup word normalization parameters.
731  word_res->SetupForRecognition(unicharset, this, BestPix(),
732  tessedit_ocr_engine_mode, nullptr,
736  pr_it->row()->row, pr_it->block()->block);
737  }
739  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
740  1.0, 0.0f, -1000.0f, 1000.0f);
741  C_BLOB_IT it(word_res->word->cblob_list());
743  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
744  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
746  color = WERD::NextColor(color);
747  }
749  return true;
750 }
BLOCK_RES * block() const
Definition: pageres.h:757
ROW_RES * row() const
Definition: pageres.h:754
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:305
bool classify_bln_numeric_mode
Definition: classify.h:541
static void Update()
Definition: scrollview.cpp:711
Pix * BestPix() const
BLOCK * block
Definition: pageres.h:117
DENORM denorm
Definition: pageres.h:204
ScrollView * bln_word_window_handle()
Definition: pgedit.cpp:165
UNICHARSET unicharset
Definition: ccutil.h:68
WERD_RES * word() const
Definition: pageres.h:751
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:308
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
void Clear()
Definition: scrollview.cpp:591
TWERD * chopped_word
Definition: pageres.h:215
ROW * row
Definition: pageres.h:143
WERD * word
Definition: pageres.h:189

◆ word_blob_quality()

int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES *  word,
ROW *  row 
)

Definition at line 61 of file docqual.cpp.

61  {
62  if (word->bln_boxes == nullptr ||
63  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
64  return 0;
65 
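66  DocQualCallbacks cb(word);
67  word->bln_boxes->ProcessMatchedBlobs(
68  *word->rebuild_word,
69  NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
70  return cb.match_count;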
71 }
TWERD * rebuild_word
Definition: pageres.h:260
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void CountMatchingBlobs(int index)
Definition: docqual.cpp:35
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES *  word,
ROW *  row,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 93 of file docqual.cpp.

96  {
97  if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
98  word->rebuild_word->blobs.empty()) {
99  *match_count = 0;
100  *accepted_match_count = 0;
101  return;
102  }
103 
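104  DocQualCallbacks cb(word);
105  word->bln_boxes->ProcessMatchedBlobs(
106  *word->rebuild_word,
107  NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
108  *match_count = cb.match_count;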
109  *accepted_match_count = cb.accepted_match_count;
110 }
TWERD * rebuild_word
Definition: pageres.h:260
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:39
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

◆ word_contains_non_1_digit()

bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 514 of file reject.cpp.

515  {
516  int16_t i;
517  int16_t offset;
518 
519  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
520  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
521  (word_lengths[i] != 1 || word[offset] != '1'))
522  return true;
523  }
524  return false;
525 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHARSET unicharset
Definition: ccutil.h:68

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES *  word,
int16_t &  delete_mode 
)

Definition at line 895 of file docqual.cpp.

895  {
896  int word_len = word->reject_map.length ();
897  float rating_per_ch;
898  TBOX box; //BB of word
899 
900  if (word->unlv_crunch_mode == CR_NONE) {
901  delete_mode = 0;
902  return CR_NONE;
903  }
904 
905  if (word_len == 0) {
906  delete_mode = 1;
907  return CR_DELETE;
908  }
909 
910  if (word->rebuild_word != nullptr) {
911  // Cube leaves rebuild_word nullptr.
912  box = word->rebuild_word->bounding_box();
913  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
914  delete_mode = 4;
915  return CR_DELETE;
916  }
917 
918  if (noise_outlines(word->rebuild_word)) {
919  delete_mode = 5;
920  return CR_DELETE;
921  }
922  }
923 
924  if ((failure_count (word) * 1.5) > word_len) {
925  delete_mode = 2;
926  return CR_LOOSE_SPACE;
927  }
928 
929  if (word->best_choice->certainty () < crunch_del_cert) {
930  delete_mode = 7;
931  return CR_LOOSE_SPACE;
932  }
933 
934  rating_per_ch = word->best_choice->rating () / word_len;
935 
936  if (rating_per_ch > crunch_del_rating) {
937  delete_mode = 8;
938  return CR_LOOSE_SPACE;
939  }
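940 
941  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
942  delete_mode = 9;
943  return CR_LOOSE_SPACE;
944  }
945 
946  if (box.bottom () >
947  kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
948  delete_mode = 10;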
949  return CR_LOOSE_SPACE;
950  }
951 
952  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
953  delete_mode = 11;
954  return CR_LOOSE_SPACE;
955  }
956 
957  if (box.width () < crunch_del_min_width * kBlnXHeight) {
958  delete_mode = 3;
959  return CR_LOOSE_SPACE;
960  }
961 
962  delete_mode = 0;
963  return CR_NONE;
964 }
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:966
TWERD * rebuild_word
Definition: pageres.h:260
REJMAP reject_map
Definition: pageres.h:287
TBOX bounding_box() const
Definition: blobs.cpp:871
Definition: rect.h:34
int32_t length() const
Definition: rejctmap.h:223
const int kBlnXHeight
Definition: normalis.h:24
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
const int kBlnBaselineOffset
Definition: normalis.h:25
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:978
int16_t width() const
Definition: rect.h:115
int16_t top() const
Definition: rect.h:58
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
int16_t bottom() const
Definition: rect.h:65
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108

◆ word_display()

bool tesseract::Tesseract::word_display ( PAGE_RES_IT *  pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 759 of file pgedit.cpp.

759  {
760  WERD_RES* word_res = pr_it->word();
761  WERD* word = word_res->word;
762  TBOX word_bb; // word bounding box
763  int word_height; // ht of word BB
764  bool displayed_something = false;
765  float shift; // from bot left
766 
767  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
768  BoxWord* box_word = word_res->box_word;
769  WERD_CHOICE* best_choice = word_res->best_choice;
770  int length = box_word->length();
771  if (word_res->fontinfo == nullptr) return false;
772  const FontInfo& font_info = *word_res->fontinfo;
773  for (int i = 0; i < length; ++i) {
775  switch (color_mode) {
776  case CM_SUBSCRIPT:
777  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
778  color = ScrollView::RED;
779  break;
780  case CM_SUPERSCRIPT:
781  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
782  color = ScrollView::RED;
783  break;
784  case CM_ITALIC:
785  if (font_info.is_italic())
786  color = ScrollView::RED;
787  break;
788  case CM_BOLD:
789  if (font_info.is_bold())
790  color = ScrollView::RED;
791  break;
792  case CM_FIXEDPITCH:
793  if (font_info.is_fixed_pitch())
794  color = ScrollView::RED;
795  break;
796  case CM_SERIF:
797  if (font_info.is_serif())
798  color = ScrollView::RED;
799  break;
800  case CM_SMALLCAPS:
801  if (word_res->small_caps)
802  color = ScrollView::RED;
803  break;
804  case CM_DROPCAPS:
805  if (best_choice->BlobPosition(i) == SP_DROPCAP)
806  color = ScrollView::RED;
807  break;
808  // TODO(rays) underline is currently completely unsupported.
809  case CM_UNDERLINE:
810  default:
811  break;
812  }
813  image_win->Pen(color);
814  TBOX box = box_word->BlobBox(i);
815  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
816  }
817  return true;
818  }
819  /*
820  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
821  etc. are to keep the compiler happy.
822  */
823  // display bounding box
824  if (word->display_flag(DF_BOX)) {
825  word->bounding_box().plot(image_win,
826  (ScrollView::Color)((int32_t)
828  (ScrollView::Color)((int32_t)
830 
832  ((int32_t) editor_image_blob_bb_color);
833  image_win->Pen(c);
834  // cblob iterator
835  C_BLOB_IT c_it(word->cblob_list());
836  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
837  c_it.data()->bounding_box().plot(image_win);
838  displayed_something = true;
839  }
840 
841  // display edge steps
842  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
843  word->plot(image_win); // rainbow colors
844  displayed_something = true;
845  }
846 
847  // display poly approx
848  if (word->display_flag(DF_POLYGONAL)) {
849  // need to convert
851  tword->plot(image_win);
852  delete tword;
853  displayed_something = true;
854  }
855 
856  // Display correct text and blamer information.
857  STRING text;
858  STRING blame;
859  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
860  text = word->text();
861  }
862  if (word->display_flag(DF_BLAMER) &&
863  !(word_res->blamer_bundle != nullptr &&
865  text = "";
866  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
867  if (blamer_bundle == nullptr) {
868  text += "NULL";
869  } else {
870  text = blamer_bundle->TruthString();
871  }
872  text += " -> ";
873  STRING best_choice_str;
874  if (word_res->best_choice == nullptr) {
875  best_choice_str = "NULL";
876  } else {
877  word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
878  }
879  text += best_choice_str;
880  IncorrectResultReason reason = (blamer_bundle == nullptr) ?
881  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
882  ASSERT_HOST(reason < IRR_NUM_REASONS)
883  blame += " [";
884  blame += BlamerBundle::IncorrectReasonName(reason);
885  blame += "]";
886  }
887  if (text.length() > 0) {
888  word_bb = word->bounding_box();
889  image_win->Pen(ScrollView::RED);
890  word_height = word_bb.height();
891  int text_height = 0.50 * word_height;
892  if (text_height > 20) text_height = 20;
893  image_win->TextAttributes("Arial", text_height, false, false, false);
894  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
895  image_win->Text(word_bb.left() + shift,
896  word_bb.bottom() + 0.25 * word_height, text.string());
897  if (blame.length() > 0) {
898  image_win->Text(word_bb.left() + shift,
899  word_bb.bottom() + 0.25 * word_height - text_height,
900  blame.string());
901  }
902 
903  displayed_something = true;
904  }
905 
906  if (!displayed_something) // display BBox anyway
907  word->bounding_box().plot(image_win,
909  (ScrollView::Color)((int32_t)
911  return true;
912 }
int editor_image_blob_bb_color
Definition: pgedit.cpp:131
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:637
Definition: blobs.h:402
IncorrectResultReason
Definition: blamer.h:49
const char * string() const
Definition: strngs.cpp:196
TBOX bounding_box() const
Definition: werd.cpp:159
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:296
Definition: rect.h:34
Definition: werd.h:54
const FontInfo * fontinfo
Definition: pageres.h:304
Definition: werd.h:49
bool is_bold() const
Definition: fontinfo.h:112
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
bool small_caps
Definition: pageres.h:299
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
void plot(ScrollView *fd) const
Definition: rect.h:286
int16_t top() const
Definition: rect.h:58
const char * text() const
Definition: werd.h:123
bool is_italic() const
Definition: fontinfo.h:111
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:654
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
STRING TruthString() const
Definition: blamer.h:112
void plot(ScrollView *window)
Definition: blobs.cpp:907
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
bool display_flag(uint8_t flag) const
Definition: werd.h:129
int length() const
Definition: ratngs.h:303
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
Definition: strngs.h:45
BlamerBundle * blamer_bundle
Definition: pageres.h:246
bool is_serif() const
Definition: fontinfo.h:114
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:322
int16_t right() const
Definition: rect.h:79
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
bool is_fixed_pitch() const
Definition: fontinfo.h:113
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:786
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
Definition: werd.h:50
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108
tesseract::BoxWord * box_word
Definition: pageres.h:266
int editor_image_word_bb_color
Definition: pgedit.cpp:129
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

◆ word_dumper()

bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT *  pr_it)

word_dumper()

Dump members to the debug window

Definition at line 920 of file pgedit.cpp.

920  {
921  if (pr_it->block()->block != nullptr) {
922  tprintf("\nBlock data...\n");
923  pr_it->block()->block->print(nullptr, false);
924  }
925  tprintf("\nRow data...\n");
926  pr_it->row()->row->print(nullptr);
927  tprintf("\nWord data...\n");
928  WERD_RES* word_res = pr_it->word();
929  word_res->word->print();
930  if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
932  tprintf("Current blamer debug: %s\n",
933  word_res->blamer_bundle->debug().string());
934  }
935  return true;
936 }
BLOCK_RES * block() const
Definition: pageres.h:757
const STRING & debug() const
Definition: blamer.h:128
ROW_RES * row() const
Definition: pageres.h:754
void print(FILE *fp)
Definition: ocrrow.cpp:167
const char * string() const
Definition: strngs.cpp:196
void print()
Definition: werd.cpp:265
BLOCK * block
Definition: pageres.h:117
void print(FILE *fp, bool dump)
dump whole table
Definition: ocrblock.cpp:194
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
BlamerBundle * blamer_bundle
Definition: pageres.h:246
bool wordrec_debug_blamer
Definition: wordrec.h:236
ROW * row
Definition: pageres.h:143
WERD * word
Definition: pageres.h:189

◆ word_outline_errs()

int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES *  word)

Definition at line 73 of file docqual.cpp.

73  {
74  int16_t i = 0;
75  int16_t err_count = 0;
76 
77  if (word->rebuild_word != nullptr) {
78  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
79  TBLOB* blob = word->rebuild_word->blobs[b];
80  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
81  blob->NumOutlines());
82  i++;
83  }
84  }
85  return err_count;
86 }
TWERD * rebuild_word
Definition: pageres.h:260
int NumBlobs() const
Definition: blobs.h:432
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:127
int NumOutlines() const
Definition: blobs.cpp:464
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
const STRING & unichar_string() const
Definition: ratngs.h:541
Definition: blobs.h:268
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ word_set_display()

bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT *  pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 944 of file pgedit.cpp.

944  {
945  WERD* word = pr_it->word()->word;
953  return word_display(pr_it);
954 }
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:759
Definition: werd.h:54
Definition: werd.h:49
bool bit(uint8_t bit_num) const
Definition: bits16.h:57
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
Definition: werd.h:50
void set_display_flag(uint8_t flag, bool value)
Definition: werd.h:130
WERD * word
Definition: pageres.h:189
BITS16 word_display_mode
Definition: pgedit.cpp:115

◆ worst_noise_blob()

int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES *  word_res,
float *  worst_noise_score 
)

Definition at line 710 of file fixspace.cpp.

711  {
712  float noise_score[512];
713  int i;
714  int min_noise_blob; // 1st contender
715  int max_noise_blob; // last contender
716  int non_noise_count;
717  int worst_noise_blob; // Worst blob
718  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
719  float non_noise_limit = kBlnXHeight * 0.8;
720 
721  if (word_res->rebuild_word == nullptr)
722  return -1; // Can't handle cube words.
723 
724  // Normalised.
725  int blob_count = word_res->box_word->length();
726  ASSERT_HOST(blob_count <= 512);
727  if (blob_count < 5)
728  return -1; // too short to split
729 
730  /* Get the noise scores for all blobs */
731 
732  #ifndef SECURE_NAMES
733  if (debug_fix_space_level > 5)
734  tprintf("FP fixspace Noise metrics for \"%s\": ",
735  word_res->best_choice->unichar_string().string());
736  #endif
737 
738  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
739  TBLOB* blob = word_res->rebuild_word->blobs[i];
740  if (word_res->reject_map[i].accepted())
741  noise_score[i] = non_noise_limit;
742  else
743  noise_score[i] = blob_noise_score(blob);
744 
745  if (debug_fix_space_level > 5)
746  tprintf("%1.1f ", noise_score[i]);
747  }
748  if (debug_fix_space_level > 5)
749  tprintf("\n");
750 
751  /* Now find the worst one which is far enough away from the end of the word */
752 
753  non_noise_count = 0;
754  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
755  if (noise_score[i] >= non_noise_limit) {
756  non_noise_count++;
757  }
758  }
759  if (non_noise_count < fixsp_non_noise_limit)
760  return -1;
761 
762  min_noise_blob = i;
763 
764  non_noise_count = 0;
765  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
766  i--) {
767  if (noise_score[i] >= non_noise_limit) {
768  non_noise_count++;
769  }
770  }
771  if (non_noise_count < fixsp_non_noise_limit)
772  return -1;
773 
774  max_noise_blob = i;
775 
776  if (min_noise_blob > max_noise_blob)
777  return -1;
778 
779  *worst_noise_score = small_limit;
780  worst_noise_blob = -1;
781  for (i = min_noise_blob; i <= max_noise_blob; i++) {
782  if (noise_score[i] < *worst_noise_score) {
783  worst_noise_blob = i;
784  *worst_noise_score = noise_score[i];
785  }
786  }
787  return worst_noise_blob;
788 }
TWERD * rebuild_word
Definition: pageres.h:260
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int NumBlobs() const
Definition: blobs.h:432
const int kBlnXHeight
Definition: normalis.h:24
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:790
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710
int length() const
Definition: boxword.h:83
const STRING & unichar_string() const
Definition: ratngs.h:541
Definition: blobs.h:268
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::BoxWord * box_word
Definition: pageres.h:266
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT &  page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 105 of file output.cpp.

107  { // override tilde crunch?
108  WERD_RES *word = page_res_it.word();
109  const UNICHARSET &uchset = *word->uch_set;
110  int i;
111  bool need_reject = false;
112  UNICHAR_ID space = uchset.unichar_to_id(" ");
113 
114  if ((word->unlv_crunch_mode != CR_NONE ||
115  word->best_choice->length() == 0) &&
117  if ((word->unlv_crunch_mode != CR_DELETE) &&
118  (!stats_.tilde_crunch_written ||
119  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
120  (word->word->space () > 0) &&
121  !word->word->flag (W_FUZZY_NON) &&
122  !word->word->flag (W_FUZZY_SP)))) {
123  if (!word->word->flag (W_BOL) &&
124  (word->word->space () > 0) &&
125  !word->word->flag (W_FUZZY_NON) &&
126  !word->word->flag (W_FUZZY_SP)) {
127  stats_.last_char_was_tilde = false;
128  }
129  need_reject = true;
130  }
131  if ((need_reject && !stats_.last_char_was_tilde) ||
132  (force_eol && stats_.write_results_empty_block)) {
133  /* Write a reject char - mark as rejected unless zero_rejection mode */
134  stats_.last_char_was_tilde = TRUE;
135  stats_.tilde_crunch_written = true;
136  stats_.last_char_was_newline = false;
137  stats_.write_results_empty_block = false;
138  }
139 
140  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
141  stats_.tilde_crunch_written = false;
142  stats_.last_char_was_newline = true;
143  stats_.last_char_was_tilde = false;
144  }
145 
146  if (force_eol)
147  stats_.write_results_empty_block = true;
148  return;
149  }
150 
151  /* NORMAL PROCESSING of non tilde crunched words */
152 
153  stats_.tilde_crunch_written = false;
154  if (newline_type)
155  stats_.last_char_was_newline = true;
156  else
157  stats_.last_char_was_newline = false;
158  stats_.write_results_empty_block = force_eol; // about to write a real word
159 
160  if (unlv_tilde_crunching &&
161  stats_.last_char_was_tilde &&
162  (word->word->space() == 0) &&
164  (word->best_choice->unichar_id(0) == space)) {
165  /* Prevent adjacent tilde across words - we know that adjacent tildes within
166  words have been removed */
167  word->MergeAdjacentBlobs(0);
168  }
169  if (newline_type ||
171  stats_.last_char_was_tilde = false;
172  else {
173  if (word->reject_map.length () > 0) {
174  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
175  stats_.last_char_was_tilde = true;
176  else
177  stats_.last_char_was_tilde = false;
178  }
179  else if (word->word->space () > 0)
180  stats_.last_char_was_tilde = false;
181  /* else it is unchanged as there are no output chars */
182  }
183 
184  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
185 
186  set_unlv_suspects(word);
187  check_debug_pt (word, 120);
189  tprintf ("Dict word: \"%s\": %d\n",
190  word->best_choice->debug_string().string(),
191  dict_word(*(word->best_choice)));
192  }
193  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
195  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196  for (i = 0; i < word->best_choice->length(); ++i) {
197  if (word->reject_map[i].rejected())
198  word->reject_map[i].setrej_minimal_rej_accept();
199  }
200  }
202  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
203  for (i = 0; i < word->best_choice->length(); ++i) {
204  if ((word->best_choice->unichar_id(i) != space) &&
205  word->reject_map[i].rejected())
206  word->reject_map[i].setrej_minimal_rej_accept();
207  }
208  }
209  }
210 }
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:280
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:980
int UNICHAR_ID
Definition: unichar.h:35
#define TRUE
Definition: capi.h:51
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
uint8_t space()
Definition: werd.h:102
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
WERD_RES * word() const
Definition: pageres.h:751
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
int length() const
Definition: ratngs.h:303
const STRING debug_string() const
Definition: ratngs.h:505
const UNICHARSET * uch_set
Definition: pageres.h:206
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
WERD_CHOICE * best_choice
Definition: pageres.h:235
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

Member Data Documentation

◆ applybox_debug

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 850 of file tesseractclass.h.

◆ applybox_exposure_pattern

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"

Definition at line 855 of file tesseractclass.h.

◆ applybox_learn_chars_and_char_frags_mode

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."

Definition at line 859 of file tesseractclass.h.

◆ applybox_learn_ngrams_mode

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."

Definition at line 862 of file tesseractclass.h.

◆ applybox_page

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 851 of file tesseractclass.h.

◆ bidi_debug

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 849 of file tesseractclass.h.

◆ bland_unrej

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 963 of file tesseractclass.h.

◆ chs_leading_punct

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 902 of file tesseractclass.h.

◆ chs_trailing_punct1

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 903 of file tesseractclass.h.

◆ chs_trailing_punct2

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 904 of file tesseractclass.h.

◆ conflict_set_I_l_1

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1083 of file tesseractclass.h.

◆ crunch_accept_ok

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 992 of file tesseractclass.h.

◆ crunch_debug

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 1001 of file tesseractclass.h.

◆ crunch_del_cert

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 981 of file tesseractclass.h.

◆ crunch_del_high_word

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 986 of file tesseractclass.h.

◆ crunch_del_low_word

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 987 of file tesseractclass.h.

◆ crunch_del_max_ht

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 983 of file tesseractclass.h.

◆ crunch_del_min_ht

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 982 of file tesseractclass.h.

◆ crunch_del_min_width

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 984 of file tesseractclass.h.

◆ crunch_del_rating

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 980 of file tesseractclass.h.

◆ crunch_early_convert_bad_unlv_chs

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 971 of file tesseractclass.h.

◆ crunch_early_merge_tess_fails

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 970 of file tesseractclass.h.

◆ crunch_include_numerals

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 995 of file tesseractclass.h.

◆ crunch_leave_accept_strings

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 994 of file tesseractclass.h.

◆ crunch_leave_lc_strings

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 997 of file tesseractclass.h.

◆ crunch_leave_ok_strings

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 991 of file tesseractclass.h.

◆ crunch_leave_uc_strings

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 999 of file tesseractclass.h.

◆ crunch_long_repetitions

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 1000 of file tesseractclass.h.

◆ crunch_poor_garbage_cert

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 975 of file tesseractclass.h.

◆ crunch_poor_garbage_rate

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 976 of file tesseractclass.h.

◆ crunch_pot_garbage

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 979 of file tesseractclass.h.

◆ crunch_pot_indicators

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 990 of file tesseractclass.h.

◆ crunch_pot_poor_cert

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 978 of file tesseractclass.h.

◆ crunch_pot_poor_rate

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 977 of file tesseractclass.h.

◆ crunch_rating_max

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 989 of file tesseractclass.h.

◆ crunch_small_outlines_size

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 988 of file tesseractclass.h.

◆ crunch_terrible_garbage

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 973 of file tesseractclass.h.

◆ crunch_terrible_rating

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 972 of file tesseractclass.h.

◆ debug_acceptable_wds

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 901 of file tesseractclass.h.

◆ debug_fix_space_level

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 1007 of file tesseractclass.h.

◆ debug_noise_removal

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 885 of file tesseractclass.h.

◆ debug_x_ht_level

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 900 of file tesseractclass.h.

◆ docqual_excuse_outline_errs

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 931 of file tesseractclass.h.

◆ enable_noise_removal

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"

Definition at line 884 of file tesseractclass.h.

◆ file_type

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1090 of file tesseractclass.h.

◆ fixsp_done_mode

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 1006 of file tesseractclass.h.

◆ fixsp_non_noise_limit

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 1003 of file tesseractclass.h.

◆ fixsp_small_outlines_size

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 1004 of file tesseractclass.h.

◆ hocr_font_info

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 969 of file tesseractclass.h.

◆ interactive_display_mode

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1089 of file tesseractclass.h.

◆ jpg_quality

int tesseract::Tesseract::jpg_quality = 85

"Set JPEG quality level"

Definition at line 1044 of file tesseractclass.h.

◆ lstm_choice_mode

int tesseract::Tesseract::lstm_choice_mode = 0

"Allows including alternative symbol choices in the hOCR output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " "With 2 the alternative symbol choices are accumulated per character."

Definition at line 1125 of file tesseractclass.h.

◆ lstm_use_matrix

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 927 of file tesseractclass.h.

◆ min_characters_to_try

int tesseract::Tesseract::min_characters_to_try = 50

"Specify minimum characters to try during OSD"

Definition at line 1047 of file tesseractclass.h.

◆ min_orientation_margin

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1099 of file tesseractclass.h.

◆ min_sane_x_ht_pixels

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq to this"

Definition at line 1084 of file tesseractclass.h.

◆ multilang_debug_level

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 922 of file tesseractclass.h.

◆ noise_cert_basechar

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 888 of file tesseractclass.h.

◆ noise_cert_disjoint

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 891 of file tesseractclass.h.

◆ noise_cert_factor

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 897 of file tesseractclass.h.

◆ noise_cert_punc

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 894 of file tesseractclass.h.

◆ noise_maxperblob

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 898 of file tesseractclass.h.

◆ noise_maxperword

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 899 of file tesseractclass.h.

◆ numeric_punctuation

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 1009 of file tesseractclass.h.

◆ ocr_devanagari_split_strategy

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 844 of file tesseractclass.h.

◆ ok_repeated_ch_non_alphanum_wds

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1082 of file tesseractclass.h.

◆ outlines_2

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 929 of file tesseractclass.h.

◆ outlines_odd

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 928 of file tesseractclass.h.

◆ page_separator

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1120 of file tesseractclass.h.

◆ pageseg_devanagari_split_strategy

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 840 of file tesseractclass.h.

◆ paragraph_debug_level

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 923 of file tesseractclass.h.

◆ paragraph_text_based

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 926 of file tesseractclass.h.

◆ poly_allow_detailed_fx

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1103 of file tesseractclass.h.

◆ preserve_interword_spaces

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1118 of file tesseractclass.h.

◆ quality_blob_pc

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 906 of file tesseractclass.h.

◆ quality_char_pc

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 909 of file tesseractclass.h.

◆ quality_min_initial_alphas_reqd

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 910 of file tesseractclass.h.

◆ quality_outline_pc

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 908 of file tesseractclass.h.

◆ quality_rej_pc

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 905 of file tesseractclass.h.

◆ quality_rowrej_pc

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 965 of file tesseractclass.h.

◆ rej_1Il_trust_permuter_type

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1073 of file tesseractclass.h.

◆ rej_1Il_use_dict_word

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1072 of file tesseractclass.h.

◆ rej_alphas_in_number_perm

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1078 of file tesseractclass.h.

◆ rej_trust_doc_dawg

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 1071 of file tesseractclass.h.

◆ rej_use_good_perm

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1076 of file tesseractclass.h.

◆ rej_use_sensible_wd

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1077 of file tesseractclass.h.

◆ rej_use_tess_accepted

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1074 of file tesseractclass.h.

◆ rej_use_tess_blanks

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1075 of file tesseractclass.h.

◆ rej_whole_of_mostly_reject_word_fract

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1079 of file tesseractclass.h.

◆ subscript_max_y_top

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 1028 of file tesseractclass.h.

◆ superscript_bettered_certainty

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 1020 of file tesseractclass.h.

◆ superscript_debug

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 1013 of file tesseractclass.h.

◆ superscript_min_y_bottom

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 1032 of file tesseractclass.h.

◆ superscript_scaledown_ratio

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 1024 of file tesseractclass.h.

◆ superscript_worse_certainty

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 1016 of file tesseractclass.h.

◆ suspect_accept_rating

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1056 of file tesseractclass.h.

◆ suspect_constrain_1Il

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1054 of file tesseractclass.h.

◆ suspect_level

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1050 of file tesseractclass.h.

◆ suspect_rating_per_ch

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1055 of file tesseractclass.h.

◆ suspect_short_words

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1053 of file tesseractclass.h.

◆ suspect_space_level

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 1052 of file tesseractclass.h.

◆ tessedit_adaption_debug

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 848 of file tesseractclass.h.

◆ tessedit_ambigs_training

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 836 of file tesseractclass.h.

◆ tessedit_bigram_debug

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 881 of file tesseractclass.h.

◆ tessedit_char_blacklist

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 830 of file tesseractclass.h.

◆ tessedit_char_unblacklist

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 834 of file tesseractclass.h.

◆ tessedit_char_whitelist

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 832 of file tesseractclass.h.

◆ tessedit_consistent_reps

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 1063 of file tesseractclass.h.

◆ tessedit_create_boxfile

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1085 of file tesseractclass.h.

◆ tessedit_create_hocr

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1039 of file tesseractclass.h.

◆ tessedit_create_pdf

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1041 of file tesseractclass.h.

◆ tessedit_create_tsv

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1040 of file tesseractclass.h.

◆ tessedit_create_txt

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 1038 of file tesseractclass.h.

◆ tessedit_debug_block_rejection

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 875 of file tesseractclass.h.

◆ tessedit_debug_doc_rejection

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 960 of file tesseractclass.h.

◆ tessedit_debug_fonts

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 874 of file tesseractclass.h.

◆ tessedit_debug_quality_metrics

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 962 of file tesseractclass.h.

◆ tessedit_display_outwords

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 863 of file tesseractclass.h.

◆ tessedit_dont_blkrej_good_wds

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 949 of file tesseractclass.h.

◆ tessedit_dont_rowrej_good_wds

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 951 of file tesseractclass.h.

◆ tessedit_dump_choices

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 864 of file tesseractclass.h.

◆ tessedit_dump_pageseg_images

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 821 of file tesseractclass.h.

◆ tessedit_enable_bigram_correction

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 877 of file tesseractclass.h.

◆ tessedit_enable_dict_correction

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 879 of file tesseractclass.h.

◆ tessedit_enable_doc_dict

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 873 of file tesseractclass.h.

◆ tessedit_fix_fuzzy_spaces

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 867 of file tesseractclass.h.

◆ tessedit_fix_hyphens

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 870 of file tesseractclass.h.

◆ tessedit_flip_0O

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1066 of file tesseractclass.h.

◆ tessedit_good_doc_still_rowrej_wd

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 957 of file tesseractclass.h.

◆ tessedit_good_quality_unrej

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 933 of file tesseractclass.h.

◆ tessedit_image_border

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1080 of file tesseractclass.h.

◆ tessedit_init_config_only

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1106 of file tesseractclass.h.

◆ tessedit_load_sublangs

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1093 of file tesseractclass.h.

◆ tessedit_lower_flip_hyphen

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1068 of file tesseractclass.h.

◆ tessedit_make_boxes_from_boxes

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 817 of file tesseractclass.h.

◆ tessedit_matcher_log

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 916 of file tesseractclass.h.

◆ tessedit_minimal_rej_pass1

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 914 of file tesseractclass.h.

◆ tessedit_minimal_rejection

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1057 of file tesseractclass.h.

◆ tessedit_ocr_engine_mode

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."

Definition at line 828 of file tesseractclass.h.

◆ tessedit_override_permuter

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1091 of file tesseractclass.h.

◆ tessedit_page_number

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1087 of file tesseractclass.h.

◆ tessedit_pageseg_mode

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"

Definition at line 825 of file tesseractclass.h.

◆ tessedit_parallelize

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1116 of file tesseractclass.h.

◆ tessedit_prefer_joined_punct

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 1005 of file tesseractclass.h.

◆ tessedit_preserve_blk_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 945 of file tesseractclass.h.

◆ tessedit_preserve_min_wd_len

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 953 of file tesseractclass.h.

◆ tessedit_preserve_row_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 947 of file tesseractclass.h.

◆ tessedit_redo_xheight

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 871 of file tesseractclass.h.

◆ tessedit_reject_bad_qual_wds

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 959 of file tesseractclass.h.

◆ tessedit_reject_block_percent

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 938 of file tesseractclass.h.

◆ tessedit_reject_doc_percent

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 936 of file tesseractclass.h.

◆ tessedit_reject_mode

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1064 of file tesseractclass.h.

◆ tessedit_reject_row_percent

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 940 of file tesseractclass.h.

◆ tessedit_rejection_debug

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 1065 of file tesseractclass.h.

◆ tessedit_resegment_from_boxes

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 811 of file tesseractclass.h.

◆ tessedit_resegment_from_line_boxes

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 813 of file tesseractclass.h.

◆ tessedit_row_rej_good_docs

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 955 of file tesseractclass.h.

◆ tessedit_tess_adaption_mode

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 912 of file tesseractclass.h.

◆ tessedit_test_adaption

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 915 of file tesseractclass.h.

◆ tessedit_test_adaption_mode

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 918 of file tesseractclass.h.

◆ tessedit_timing_debug

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 865 of file tesseractclass.h.

◆ tessedit_train_from_boxes

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 815 of file tesseractclass.h.

◆ tessedit_train_line_recognizer

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 819 of file tesseractclass.h.

◆ tessedit_unrej_any_wd

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 869 of file tesseractclass.h.

◆ tessedit_upper_flip_hyphen

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1070 of file tesseractclass.h.

◆ tessedit_use_primary_params_model

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1095 of file tesseractclass.h.

◆ tessedit_use_reject_spaces

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 934 of file tesseractclass.h.

◆ tessedit_whole_wd_rej_row_percent

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects " "which prevents whole row rejection"

Definition at line 943 of file tesseractclass.h.

◆ tessedit_word_for_word

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1060 of file tesseractclass.h.

◆ tessedit_write_block_separators

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 1034 of file tesseractclass.h.

◆ tessedit_write_images

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1088 of file tesseractclass.h.

◆ tessedit_write_params_to_file

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 846 of file tesseractclass.h.

◆ tessedit_write_rep_codes

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 1036 of file tesseractclass.h.

◆ tessedit_write_unlv

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 1037 of file tesseractclass.h.

◆ tessedit_zero_kelvin_rejection

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1062 of file tesseractclass.h.

◆ tessedit_zero_rejection

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1058 of file tesseractclass.h.

◆ test_pt

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 919 of file tesseractclass.h.

◆ test_pt_x

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 920 of file tesseractclass.h.

◆ test_pt_y

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 921 of file tesseractclass.h.

◆ textonly_pdf

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1043 of file tesseractclass.h.

◆ textord_equation_detect

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1107 of file tesseractclass.h.

◆ textord_tabfind_aligned_gap_fraction

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1115 of file tesseractclass.h.

◆ textord_tabfind_force_vertical_text

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1110 of file tesseractclass.h.

◆ textord_tabfind_show_vlines

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1100 of file tesseractclass.h.

◆ textord_tabfind_vertical_text

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1108 of file tesseractclass.h.

◆ textord_tabfind_vertical_text_ratio

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page " "mode"

Definition at line 1113 of file tesseractclass.h.

◆ textord_use_cjk_fp_model

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 1101 of file tesseractclass.h.

◆ unlv_tilde_crunching

bool tesseract::Tesseract::unlv_tilde_crunching = false

"Mark v.bad words for tilde crunch"

Definition at line 967 of file tesseractclass.h.

◆ unrecognised_char

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1049 of file tesseractclass.h.

◆ user_defined_dpi

int tesseract::Tesseract::user_defined_dpi = 0

"Specify DPI for input image"

Definition at line 1045 of file tesseractclass.h.

◆ x_ht_acceptance_tolerance

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 1011 of file tesseractclass.h.

◆ x_ht_min_change

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 1012 of file tesseractclass.h.


The documentation for this class was generated from the following files: