All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract ()
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
BOOL8 recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
BOOL8 check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
BOOL8 acceptable_number_string (const char *s, const char *lengths)
 
inT16 count_alphanums (const WERD_CHOICE &word)
 
inT16 count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
BOOL8 word_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_bln_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
BOOL8 word_set_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, inT16 pass)
 
BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)
 
inT16 first_alphanum_index (const char *word, const char *word_lengths)
 
inT16 first_alphanum_offset (const char *word, const char *word_lengths)
 
inT16 alpha_count (const char *word, const char *word_lengths)
 
BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
inT16 count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, inT16 pass)
 
inT16 safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
 
BOOL8 fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
 
BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
inT16 word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
inT16 count_outline_errs (char c, inT16 outline_count)
 
inT16 word_outline_errs (WERD_RES *word)
 
BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)
 
inT16 failure_count (WERD_RES *word)
 
BOOL8 noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
CubeRecoContext * GetCubeRecoContext ()
 
init_cube_objects

Instantiates Tesseract object's CubeRecoContext and TesseractCubeCombiner. Returns false if cube context could not be created or if load_combiner is true, but the combiner could not be loaded.

bool init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager)
 
run_cube_combiner

Iterates through tesseract's results and calls cube on each word, combining the results with the existing tesseract result.

void run_cube_combiner (PAGE_RES *page_res)
 
cube_word_pass1

Recognizes a single word using (only) cube. Compatible with Tesseract's classify_word_pass1/classify_word_pass2.

void cube_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
 
cube_recognize_word

Cube recognizer to recognize a single word as with classify_word_pass1 but also returns the cube object in case the combiner is needed.

CubeObject * cube_recognize_word (BLOCK *block, WERD_RES *word)
 
cube_combine_word

Combines the cube and tesseract results for a single word, leaving the result in tess_word.

void cube_combine_word (CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
 
cube_recognize

Call cube on the current word, and write the result to word. Sets up a fake result and returns false if something goes wrong.

bool cube_recognize (CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
 
fill_werd_res

Fill Tesseract's word result fields with cube's.

void fill_werd_res (const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
 
extract_cube_state

Extract CharSamp objects and character bounding boxes from the CubeObject's state. The caller should free both structures.

bool extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
 
create_cube_box_word

Fill the given BoxWord with boxes from character bounding boxes. The char_boxes have local coordinates w.r.t. the word bounding box, i.e., the left-most character bbox of each word has (0,0) left-top coord, but the BoxWord must be defined in page coordinates.

bool create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)
 
inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
virtual ~Wordrec ()
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void WordSearch (WERD_RES *word_res)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, bool init_classifier, bool init_permute)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (inT32 elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (bool load_pre_trained_templates)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (FILE *File)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
int cube_debug_level = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = true
 
bool hocr_font_info = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = true
 
bool tessedit_create_hocr = false
 
bool tessedit_create_pdf = false
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
int tessdata_manager_debug_level = 0
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = FALSE
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
bool include_page_breaks = false
 
char * page_separator = "\f"
 
bool textord_tabfind_vertical_horizontal_mix = true
 
int tessedit_ok_mode = 5
 
bool load_fixed_length_dawgs = true
 
int segment_debug = 0
 
bool permute_debug = 0
 
double bestrate_pruning_factor = 2.0
 
bool permute_script_word = 0
 
bool segment_segcost_rating = 0
 
double segment_reward_script = 0.95
 
bool permute_fixed_length_dawg = 0
 
bool permute_chartype_word = 0
 
double segment_reward_chartype = 0.97
 
double segment_reward_ngram_best_choice = 0.99
 
bool ngram_permuter_activated = false
 
bool permute_only_top = false
 
int language_model_fixed_length_choices_depth = 3
 
bool use_new_state_cost = FALSE
 
double heuristic_segcost_rating_base = 1.25
 
double heuristic_weight_rating = 1
 
double heuristic_weight_width = 1000.0
 
double heuristic_weight_seamcut = 0
 
double heuristic_max_char_wh_ratio = 2.0
 
bool enable_new_segsearch = false
 
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
TessdataManager tessdata_manager
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 170 of file tesseractclass.h.

Constructor & Destructor Documentation

tesseract::Tesseract::Tesseract ( )

Definition at line 57 of file tesseractclass.cpp.

59  "Take segmentation and labeling from box file",
60  this->params()),
62  "Conversion of word/line box file to char box file",
63  this->params()),
65  "Generate training data from boxed chars", this->params()),
67  "Generate more boxes from boxed chars", this->params()),
69  "Dump intermediate images made during page segmentation",
70  this->params()),
71  // The default for pageseg_mode is the old behaviour, so as not to
72  // upset anything that relies on that.
73  INT_MEMBER(
75  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76  " 5=line, 6=word, 7=char"
77  " (Values from PageSegMode enum in publictypes.h)",
78  this->params()),
80  "Which OCR engine(s) to run (Tesseract, Cube, both)."
81  " Defaults to loading and running only Tesseract"
82  " (no Cube,no combiner)."
83  " Values from OcrEngineMode enum in tesseractclass.h)",
84  this->params()),
86  "Blacklist of chars not to recognize", this->params()),
88  "Whitelist of chars to recognize", this->params()),
90  "List of chars to override tessedit_char_blacklist",
91  this->params()),
93  "Perform training for ambiguities", this->params()),
96  "Whether to use the top-line splitting process for Devanagari "
97  "documents while performing page-segmentation.",
98  this->params()),
101  "Whether to use the top-line splitting process for Devanagari "
102  "documents while performing ocr.",
103  this->params()),
105  "Write all parameters to the given file.", this->params()),
107  "Generate and print debug"
108  " information for adaption",
109  this->params()),
110  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
111  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
112  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
113  this->params()),
115  "Exposure value follows"
116  " this pattern in the image filename. The name of the image"
117  " files are expected to be in the form"
118  " [lang].[fontname].exp[num].tif",
119  this->params()),
121  "Learn both character fragments (as is done in the"
122  " special low exposure mode) as well as unfragmented"
123  " characters.",
124  this->params()),
126  "Each bounding box"
127  " is assumed to contain ngrams. Only learn the ngrams"
128  " whose outlines overlap horizontally.",
129  this->params()),
130  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
131  this->params()),
132  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
133  this->params()),
134  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
135  this->params()),
137  "Try to improve fuzzy spaces", this->params()),
139  "Dont bother with word plausibility", this->params()),
140  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
141  this->params()),
142  BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
143  this->params()),
145  "Add words to the document dictionary", this->params()),
146  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
147  this->params()),
148  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
149  this->params()),
151  "Enable correction based on the word bigram dictionary.",
152  this->params()),
154  "Enable single word correction based on the dictionary.",
155  this->params()),
157  "Amount of debug output for bigram correction.",
158  this->params()),
160  "Remove and conditionally reassign small outlines when they"
161  " confuse layout analysis, determining diacritics vs noise",
162  this->params()),
163  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
164  this->params()),
165  // Worst (min) certainty, for which a diacritic is allowed to make the
166  // base
167  // character worse and still be included.
169  "Hingepoint for base char certainty", this->params()),
170  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
171  // to make the base character worse and still be included.
173  "Hingepoint for disjoint certainty", this->params()),
174  // Worst (min) certainty, for which a diacritic is allowed to make a new
175  // stand-alone blob.
177  "Threshold for new punc char certainty", this->params()),
178  // Factor of certainty margin for adding diacritics to not count as worse.
180  "Scaling on certainty diff from Hingepoint",
181  this->params()),
182  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
183  this->params()),
184  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
185  this->params()),
186  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
187  BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
188  this->params()),
189  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
190  this->params()),
191  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
192  this->params()),
193  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
194  this->params()),
196  "good_quality_doc lte rejection limit", this->params()),
198  "good_quality_doc gte good blobs limit", this->params()),
200  "good_quality_doc lte outline error limit", this->params()),
202  "good_quality_doc gte good char limit", this->params()),
203  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
204  this->params()),
206  "Adaptation decision algorithm for tess", this->params()),
208  "Do minimal rejection on pass 1 output", this->params()),
209  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
210  this->params()),
211  BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
212  this->params()),
214  "Adaptation decision algorithm for tess", this->params()),
215  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
216  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
217  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
218  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
219  this->params()),
221  "Run paragraph detection on the post-text-recognition "
222  "(more accurate)",
223  this->params()),
224  INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
225  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
226  this->params()),
227  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
228  this->params()),
230  "Allow outline errs in unrejection?", this->params()),
232  "Reduce rejection on good docs", this->params()),
233  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
234  this->params()),
236  "%rej allowed before rej whole doc", this->params()),
238  "%rej allowed before rej whole block", this->params()),
240  "%rej allowed before rej whole row", this->params()),
242  "Number of row rejects in whole word rejects"
243  "which prevents whole row rejection",
244  this->params()),
246  "Only rej partially rejected words in block rejection",
247  this->params()),
249  "Only rej partially rejected words in row rejection",
250  this->params()),
252  "Use word segmentation quality metric", this->params()),
254  "Use word segmentation quality metric", this->params()),
256  "Only preserve wds longer than this", this->params()),
258  "Apply row rejection to good docs", this->params()),
260  "rej good doc wd if more than this fraction rejected",
261  this->params()),
263  "Reject all bad quality wds", this->params()),
264  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
265  this->params()),
267  "Output data to debug file", this->params()),
268  BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs",
269  this->params()),
271  "good_quality_doc gte good char limit", this->params()),
273  "Mark v.bad words for tilde crunch", this->params()),
274  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
275  this->params()),
276  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
277  this->params()),
279  "Take out ~^ early?", this->params()),
280  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
281  this->params()),
282  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
284  "crunch garbage cert lt this", this->params()),
286  "crunch garbage rating lt this", this->params()),
287  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
288  this->params()),
289  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
290  this->params()),
291  BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
292  this->params()),
293  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
294  this->params()),
295  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
296  this->params()),
297  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
298  this->params()),
299  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
300  this->params()),
302  "Del if word width lt xht x this", this->params()),
304  "Del if word gt xht x this above bl", this->params()),
306  "Del if word gt xht x this below bl", this->params()),
307  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
308  this->params()),
309  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
310  this->params()),
312  "How many potential indicators needed", this->params()),
313  BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
314  this->params()),
315  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
316  this->params()),
318  "Dont pot crunch sensible strings", this->params()),
319  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
320  this->params()),
322  "Dont crunch words with long lower case strings",
323  this->params()),
325  "Dont crunch words with long lower case strings",
326  this->params()),
328  "Crunch words with long repetitions", this->params()),
329  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
331  "How many non-noise blbs either side?", this->params()),
332  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
333  this->params()),
335  "Reward punctation joins", this->params()),
336  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
337  this->params()),
338  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
339  this->params()),
341  "Punct. chs expected WITHIN numbers", this->params()),
343  "Max allowed deviation of blob top outside of font data",
344  this->params()),
346  "Min change in xht before actually trying it", this->params()),
348  "Debug level for sub & superscript fixer", this->params()),
351  "How many times worse "
352  "certainty does a superscript position glyph need to be for "
353  "us to try classifying it as a char with a different "
354  "baseline?",
355  this->params()),
358  "What reduction in "
359  "badness do we think sufficient to choose a superscript "
360  "over what we'd thought. For example, a value of 0.6 means "
361  "we want to reduce badness of certainty by at least 40%",
362  this->params()),
364  "A superscript scaled down more than this is unbelievably "
365  "small. For example, 0.3 means we expect the font size to "
366  "be no smaller than 30% of the text line font size.",
367  this->params()),
369  "Maximum top of a character measured as a multiple of "
370  "x-height above the baseline for us to reconsider whether "
371  "it's a subscript.",
372  this->params()),
374  "Minimum bottom of a character measured as a multiple of "
375  "x-height above the baseline for us to reconsider whether "
376  "it's a superscript.",
377  this->params()),
379  "Write block separators in output", this->params()),
380  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
381  this->params()),
382  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
383  this->params()),
384  BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
385  this->params()),
386  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
387  this->params()),
388  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
389  this->params()),
391  "Output char for unidentified blobs", this->params()),
392  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
394  "Min suspect level for rejecting spaces", this->params()),
396  "Dont Suspect dict wds longer than this", this->params()),
397  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
398  this->params()),
399  double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
400  this->params()),
401  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
402  this->params()),
404  "Only reject tess failures", this->params()),
405  BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
406  this->params()),
408  "Make output have exactly one word per WERD", this->params()),
410  "Dont reject ANYTHING AT ALL", this->params()),
412  "Force all rep chars the same", this->params()),
413  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
414  this->params()),
415  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
416  this->params()),
417  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
418  this->params()),
420  "Aspect ratio dot/hyphen test", this->params()),
422  "Aspect ratio dot/hyphen test", this->params()),
424  "Use DOC dawg in 11l conf. detector", this->params()),
425  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
426  this->params()),
427  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
428  this->params()),
429  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
430  this->params()),
431  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
432  this->params()),
433  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
434  this->params()),
435  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
436  this->params()),
437  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
438  this->params()),
440  "if >this fract", this->params()),
441  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
442  this->params()),
444  "Allow NN to unrej", this->params()),
445  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
446  this->params()),
447  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
448  this->params()),
449  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
450  this->params()),
452  "-1 -> All pages"
453  " , else specifc page to process",
454  this->params()),
456  "Capture the image from the IPE", this->params()),
457  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
458  this->params()),
459  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
460  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
461  this->params()),
463  "Debug level for"
464  " TessdataManager functions.",
465  this->params()),
467  "List of languages to load with this one", this->params()),
469  "In multilingual mode use params model of the"
470  " primary language",
471  this->params()),
473  "Min acceptable orientation margin", this->params()),
474  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
475  this->params()),
476  BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
477  this->params()),
479  "Allow feature extractors to see the original outline",
480  this->params()),
482  "Only initialize with the config file. Useful if the "
483  "instance is not going to be used for OCR but say only "
484  "for layout analysis.",
485  this->params()),
486  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
487  this->params()),
489  "Enable vertical detection", this->params()),
491  "Force using vertical text page mode", this->params()),
494  "Fraction of textlines deemed vertical to use vertical page "
495  "mode",
496  this->params()),
499  "Fraction of height used as a minimum gap for aligned blobs.",
500  this->params()),
501  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
502  this->params()),
504  "Preserve multiple interword spaces", this->params()),
506  "Include page separator string in output text after each "
507  "image/page.",
508  this->params()),
510  "Page separator (default is form feed control character)",
511  this->params()),
512 
513  // The following parameters were deprecated and removed from their
514  // original
515  // locations. The parameters are temporarily kept here to give Tesseract
516  // users a chance to updated their [lang].traineddata and config files
517  // without introducing failures during Tesseract initialization.
518  // TODO(ocr-team): remove these parameters from the code once we are
519  // reasonably sure that Tesseract users have updated their data files.
520  //
521  // BEGIN DEPRECATED PARAMETERS
523  "find horizontal lines such as headers in vertical page mode",
524  this->params()),
525  INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
526  this->params()),
528  "Load fixed length dawgs"
529  " (e.g. for non-space delimited languages)",
530  this->params()),
531  INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
532  this->params()),
533  BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
534  this->params()),
536  "Multiplying factor of"
537  " current best rate to prune other hypotheses",
538  this->params()),
540  "Turn on word script consistency permuter", this->params()),
542  "incorporate segmentation cost in word rating?",
543  this->params()),
545  "Score multipler for script consistency within a word. "
546  "Being a 'reward' factor, it should be <= 1. "
547  "Smaller value implies bigger reward.",
548  this->params()),
550  "Turn on fixed-length phrasebook search permuter",
551  this->params()),
553  "Turn on character type (property) consistency permuter",
554  this->params()),
556  "Score multipler for char type consistency within a word. ",
557  this->params()),
559  "Score multipler for ngram permuter's best choice"
560  " (only used in the Han script path).",
561  this->params()),
563  "Activate character-level n-gram-based permuter",
564  this->params()),
565  BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
566  this->params()),
568  "Depth of blob choice lists to explore"
569  " when fixed length dawgs are on",
570  this->params()),
572  "use new state cost heuristics for segmentation state"
573  " evaluation",
574  this->params()),
576  "base factor for adding segmentation cost into word rating."
577  "It's a multiplying factor, the larger the value above 1, "
578  "the bigger the effect of segmentation cost.",
579  this->params()),
581  "weight associated with char rating in combined cost of"
582  "state",
583  this->params()),
585  "weight associated with width evidence in combined cost of"
586  " state",
587  this->params()),
589  "weight associated with seam cut in combined cost of state",
590  this->params()),
592  "max char width-to-height ratio allowed in segmentation",
593  this->params()),
595  "Enable new segmentation search path.", this->params()),
597  "Maximum character width-to-height ratio for"
598  " fixed-pitch fonts",
599  this->params()),
600  // END DEPRECATED PARAMETERS
601 
602  backup_config_file_(NULL),
603  pix_binary_(NULL),
604  cube_binary_(NULL),
605  pix_grey_(NULL),
606  pix_thresholds_(NULL),
607  source_resolution_(0),
608  textord_(this),
609  right_to_left_(false),
610  scaled_color_(NULL),
611  scaled_factor_(-1),
612  deskew_(1.0f, 0.0f),
613  reskew_(1.0f, 0.0f),
614  most_recently_used_(this),
615  font_table_size_(0),
616 #ifndef ANDROID_BUILD
617  cube_cntxt_(NULL),
618  tess_cube_combiner_(NULL),
619 #endif
620  equ_detect_(NULL) {
621 }
double superscript_scaledown_ratio
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307
double segment_reward_ngram_best_choice
char * ok_repeated_ch_non_alphanum_wds
char * tessedit_write_params_to_file
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
double tessedit_whole_wd_rej_row_percent
double tessedit_reject_block_percent
double textord_tabfind_vertical_text_ratio
bool crunch_early_convert_bad_unlv_chs
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
bool textord_tabfind_vertical_horizontal_mix
bool tessedit_enable_bigram_correction
bool tessedit_resegment_from_line_boxes
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
double tessedit_reject_row_percent
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
double heuristic_segcost_rating_base
double rej_whole_of_mostly_reject_word_fract
ParamsVectors * params()
Definition: ccutil.h:65
bool tessedit_preserve_row_rej_perfect_wds
#define FALSE
Definition: capi.h:29
double tessedit_good_doc_still_rowrej_wd
double tessedit_reject_doc_percent
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
#define NULL
Definition: host.h:144
double superscript_worse_certainty
bool textord_tabfind_force_vertical_text
double segsearch_max_fixed_pitch_char_wh_ratio
bool applybox_learn_chars_and_char_frags_mode
bool tessedit_preserve_blk_rej_perfect_wds
double textord_tabfind_aligned_gap_fraction
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:313
double superscript_bettered_certainty
int language_model_fixed_length_choices_depth
tesseract::Tesseract::~Tesseract ( )

Definition at line 623 of file tesseractclass.cpp.

623  {
624  Clear();
625  end_tesseract();
626  sub_langs_.delete_data_pointers();
627 #ifndef ANDROID_BUILD
628  // Delete cube objects.
629  if (cube_cntxt_ != NULL) {
630  delete cube_cntxt_;
631  cube_cntxt_ = NULL;
632  }
633  if (tess_cube_combiner_ != NULL) {
634  delete tess_cube_combiner_;
635  tess_cube_combiner_ = NULL;
636  }
637 #endif
638 }
#define NULL
Definition: host.h:144

Member Function Documentation

BOOL8 tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 421 of file output.cpp.

422  {
423  BOOL8 prev_digit = FALSE;
424 
425  if (*lengths == 1 && *s == '(')
426  s++;
427 
428  if (*lengths == 1 &&
429  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
430  s++;
431 
432  for (; *s != '\0'; s += *(lengths++)) {
433  if (unicharset.get_isdigit(s, *lengths))
434  prev_digit = TRUE;
435  else if (prev_digit &&
436  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
437  prev_digit = FALSE;
438  else if (prev_digit && *lengths == 1 &&
439  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
440  return TRUE;
441  else if (prev_digit &&
442  *lengths == 1 && (*s == '%') &&
443  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
444  (*(s + *lengths + *(lengths + 1)) == '\0'))
445  return TRUE;
446  else
447  return FALSE;
448  }
449  return TRUE;
450 }
UNICHARSET unicharset
Definition: ccutil.h:72
unsigned char BOOL8
Definition: host.h:113
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1663 of file control.cpp.

1664  {
1665  int i = 0;
1666  int offset = 0;
1667  int leading_punct_count;
1668  int upper_count = 0;
1669  int hyphen_pos = -1;
1671 
1672  if (strlen (lengths) > 20)
1673  return word_type;
1674 
1675  /* Single Leading punctuation char*/
1676 
1677  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1678  offset += lengths[i++];
1679  leading_punct_count = i;
1680 
1681  /* Initial cap */
1682  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1683  offset += lengths[i++];
1684  upper_count++;
1685  }
1686  if (upper_count > 1) {
1687  word_type = AC_UPPER_CASE;
1688  } else {
1689  /* Lower case word, possibly with an initial cap */
1690  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1691  offset += lengths[i++];
1692  }
1693  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1694  goto not_a_word;
1695  /*
1696  Allow a single hyphen in a lower case word
1697  - dont trust upper case - I've seen several cases of "H" -> "I-I"
1698  */
1699  if (lengths[i] == 1 && s[offset] == '-') {
1700  hyphen_pos = i;
1701  offset += lengths[i++];
1702  if (s[offset] != '\0') {
1703  while ((s[offset] != '\0') &&
1704  char_set.get_islower(s + offset, lengths[i])) {
1705  offset += lengths[i++];
1706  }
1707  if (i < hyphen_pos + 3)
1708  goto not_a_word;
1709  }
1710  } else {
1711  /* Allow "'s" in NON hyphenated lower case words */
1712  if (lengths[i] == 1 && (s[offset] == '\'') &&
1713  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1714  offset += lengths[i++];
1715  offset += lengths[i++];
1716  }
1717  }
1718  if (upper_count > 0)
1719  word_type = AC_INITIAL_CAP;
1720  else
1721  word_type = AC_LOWER_CASE;
1722  }
1723 
1724  /* Up to two different, constrained trailing punctuation chars */
1725  if (lengths[i] == 1 && s[offset] != '\0' &&
1726  STRING(chs_trailing_punct1).contains(s[offset]))
1727  offset += lengths[i++];
1728  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1729  s[offset - lengths[i - 1]] != s[offset] &&
1730  STRING(chs_trailing_punct2).contains (s[offset]))
1731  offset += lengths[i++];
1732 
1733  if (s[offset] != '\0')
1734  word_type = AC_UNACCEPTABLE;
1735 
1736  not_a_word:
1737 
1738  if (word_type == AC_UNACCEPTABLE) {
1739  /* Look for abbreviation string */
1740  i = 0;
1741  offset = 0;
1742  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1743  word_type = AC_UC_ABBREV;
1744  while (s[offset] != '\0' &&
1745  char_set.get_isupper(s + offset, lengths[i]) &&
1746  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1747  offset += lengths[i++];
1748  offset += lengths[i++];
1749  }
1750  }
1751  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1752  word_type = AC_LC_ABBREV;
1753  while (s[offset] != '\0' &&
1754  char_set.get_islower(s + offset, lengths[i]) &&
1755  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1756  offset += lengths[i++];
1757  offset += lengths[i++];
1758  }
1759  }
1760  if (s[offset] != '\0')
1761  word_type = AC_UNACCEPTABLE;
1762  }
1763 
1764  return word_type;
1765 }
a.b.c.
Definition: control.h:40
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
A.B.C.
Definition: control.h:41
ALL upper case.
Definition: control.h:38
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
ALL lower case.
Definition: control.h:37
Unacceptable word.
Definition: control.h:36
ALL but initial lc.
Definition: control.h:39
Definition: strngs.h:44
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
inT16 tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 495 of file reject.cpp.

496  {
497  inT16 i;
498  inT16 offset;
499  inT16 count = 0;
500 
501  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
502  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
503  count++;
504  }
505  return count;
506 }
UNICHARSET unicharset
Definition: ccutil.h:72
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
short inT16
Definition: host.h:100
void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 203 of file recogtraining.cpp.

205  {
206  // Classify word.
207  fflush(stdout);
208  WordData word_data(*pr_it);
209  SetupWordPassN(1, &word_data);
210  classify_word_and_language(1, pr_it, &word_data);
211  WERD_RES* werd_res = word_data.word;
212  WERD_CHOICE *best_choice = werd_res->best_choice;
213  ASSERT_HOST(best_choice != NULL);
214 
215  // Compute the number of unichars in the label.
216  GenericVector<UNICHAR_ID> encoding;
217  if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
218  tprintf("Not outputting illegal unichar %s\n", label);
219  return;
220  }
221 
222  // Dump all paths through the ratings matrix (which is normally small).
223  int dim = werd_res->ratings->dimension();
224  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
225  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
226  unicharset, label, output_file);
227  delete [] blob_choices;
228 }
MATRIX * ratings
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
WERD * word
Definition: pageres.h:175
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
#define NULL
Definition: host.h:144
bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 258 of file tesseractclass.h.

258  {
259  if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;
260  for (int i = 0; i < sub_langs_.size(); ++i) {
261  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
262  return true;
263  }
264  return false;
265  }
PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 117 of file applybox.cpp.

119  {
120  GenericVector<TBOX> boxes;
121  GenericVector<STRING> texts, full_texts;
122  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
123  NULL)) {
124  return NULL; // Can't do it.
125  }
126 
127  int box_count = boxes.size();
128  int box_failures = 0;
129  // Add an empty everything to the end.
130  boxes.push_back(TBOX());
131  texts.push_back(STRING());
132  full_texts.push_back(STRING());
133 
134  // In word mode, we use the boxes to make a word for each box, but
135  // in blob mode we use the existing words and maximally chop them first.
136  PAGE_RES* page_res = find_segmentation ?
137  NULL : SetupApplyBoxes(boxes, block_list);
138  clear_any_old_text(block_list);
139 
140  for (int i = 0; i < boxes.size() - 1; i++) {
141  bool foundit = false;
142  if (page_res != NULL) {
143  if (i == 0) {
144  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
145  full_texts[i].string());
146  } else {
147  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
148  boxes[i + 1], full_texts[i].string());
149  }
150  } else {
151  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
152  texts[i].string());
153  }
154  if (!foundit) {
155  box_failures++;
156  ReportFailedBox(i, boxes[i], texts[i].string(),
157  "FAILURE! Couldn't find a matching blob");
158  }
159  }
160 
161  if (page_res == NULL) {
162  // In word/line mode, we now maximally chop all the words and resegment
163  // them with the classifier.
164  page_res = SetupApplyBoxes(boxes, block_list);
165  ReSegmentByClassification(page_res);
166  }
167  if (applybox_debug > 0) {
168  tprintf("APPLY_BOXES:\n");
169  tprintf(" Boxes read from boxfile: %6d\n", box_count);
170  if (box_failures > 0)
171  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
172  }
173  TidyUp(page_res);
174  return page_res;
175 }
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:340
int size() const
Definition: genericvector.h:72
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:764
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:438
Definition: rect.h:30
Definition: strngs.h:44
#define NULL
Definition: host.h:144
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:51
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
void tesseract::Tesseract::ApplyBoxTraining ( const STRING fontname,
PAGE_RES page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 796 of file applybox.cpp.

796  {
797  PAGE_RES_IT pr_it(page_res);
798  int word_count = 0;
799  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
800  word_res = pr_it.forward()) {
801  LearnWord(fontname.string(), word_res);
802  ++word_count;
803  }
804  tprintf("Generated training data for %d words\n", word_count);
805 }
#define tprintf(...)
Definition: tprintf.h:31
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1029 of file control.cpp.

1032  {
1033  GenericVector<bool> blob_wanted;
1034  word_wanted->init_to_size(outlines.size(), false);
1035  target_blobs->init_to_size(outlines.size(), NULL);
1036  // Check for outlines that need to be turned into stand-alone blobs.
1037  for (int i = 0; i < outlines.size(); ++i) {
1038  if (outlines[i] == NULL) continue;
1039  // Get a set of adjacent outlines that don't overlap any existing blob.
1040  blob_wanted.init_to_size(outlines.size(), false);
1041  int num_blob_outlines = 0;
1042  TBOX total_ol_box(outlines[i]->bounding_box());
1043  while (i < outlines.size() && outlines[i] != NULL) {
1044  blob_wanted[i] = true;
1045  total_ol_box += outlines[i]->bounding_box();
1046  ++i;
1047  ++num_blob_outlines;
1048  }
1049  // Find the insertion point.
1050  C_BLOB_IT blob_it(real_word->cblob_list());
1051  while (!blob_it.at_last() &&
1052  blob_it.data_relative(1)->bounding_box().left() <=
1053  total_ol_box.left()) {
1054  blob_it.forward();
1055  }
1056  // Choose which combination of them we actually want and where to put
1057  // them.
1058  if (debug_noise_removal)
1059  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1060  C_BLOB* left_blob = blob_it.data();
1061  TBOX left_box = left_blob->bounding_box();
1062  C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
1063  if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
1064  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1065  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1066  outlines, num_blob_outlines,
1067  &blob_wanted)) {
1068  if (debug_noise_removal) tprintf("Added to left blob\n");
1069  for (int j = 0; j < blob_wanted.size(); ++j) {
1070  if (blob_wanted[j]) {
1071  (*word_wanted)[j] = true;
1072  (*target_blobs)[j] = left_blob;
1073  }
1074  }
1075  } else if (right_blob != NULL &&
1076  (!left_box.x_overlap(total_ol_box) ||
1077  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1079  right_blob, outlines,
1080  num_blob_outlines, &blob_wanted)) {
1081  if (debug_noise_removal) tprintf("Added to right blob\n");
1082  for (int j = 0; j < blob_wanted.size(); ++j) {
1083  if (blob_wanted[j]) {
1084  (*word_wanted)[j] = true;
1085  (*target_blobs)[j] = right_blob;
1086  }
1087  }
1088  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
1089  outlines, num_blob_outlines,
1090  &blob_wanted)) {
1091  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1092  for (int j = 0; j < blob_wanted.size(); ++j) {
1093  if (blob_wanted[j]) {
1094  (*word_wanted)[j] = true;
1095  (*target_blobs)[j] = NULL;
1096  }
1097  }
1098  }
1099  }
1100 }
int size() const
Definition: genericvector.h:72
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1105
#define tprintf(...)
Definition: tprintf.h:31
void init_to_size(int size, T t)
bool x_overlap(const TBOX &box) const
Definition: rect.h:391
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
#define NULL
Definition: host.h:144
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 976 of file control.cpp.

980  {
981  GenericVector<bool> blob_wanted;
982  word_wanted->init_to_size(outlines.size(), false);
983  overlapped_any_blob->init_to_size(outlines.size(), false);
984  target_blobs->init_to_size(outlines.size(), NULL);
985  // For each real blob, find the outlines that seriously overlap it.
986  // A single blob could be several merged characters, so there can be quite
987  // a few outlines overlapping, and the full engine needs to be used to chop
988  // and join to get a sensible result.
989  C_BLOB_IT blob_it(real_word->cblob_list());
990  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
991  C_BLOB* blob = blob_it.data();
992  TBOX blob_box = blob->bounding_box();
993  blob_wanted.init_to_size(outlines.size(), false);
994  int num_blob_outlines = 0;
995  for (int i = 0; i < outlines.size(); ++i) {
996  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
997  !(*word_wanted)[i]) {
998  blob_wanted[i] = true;
999  (*overlapped_any_blob)[i] = true;
1000  ++num_blob_outlines;
1001  }
1002  }
1003  if (debug_noise_removal) {
1004  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1005  blob_box.print();
1006  }
1007  // If any outlines overlap the blob, and not too many, classify the blob
1008  // (using the full engine, languages and all), and choose the maximal
1009  // combination of outlines that doesn't hurt the end-result classification
1010  // by too much. Mark them as wanted.
1011  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1012  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1013  outlines, num_blob_outlines,
1014  &blob_wanted)) {
1015  for (int i = 0; i < blob_wanted.size(); ++i) {
1016  if (blob_wanted[i]) {
1017  // Claim the outline and record where it is going.
1018  (*word_wanted)[i] = true;
1019  (*target_blobs)[i] = blob;
1020  }
1021  }
1022  }
1023  }
1024  }
1025 }
int size() const
Definition: genericvector.h:72
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1105
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
void init_to_size(int size, T t)
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
#define NULL
Definition: host.h:144
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritic/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and their noise/diacritic status determined via classification.

If osd (orientation and script detection) is true, then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 232 of file pagesegmain.cpp.

235  {
236  if (textord_debug_images) {
237  WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
238  }
239  Pix* photomask_pix = NULL;
240  Pix* musicmask_pix = NULL;
241  // The blocks made by the ColumnFinder. Moved to blocks before return.
242  BLOCK_LIST found_blocks;
243  TO_BLOCK_LIST temp_blocks;
244 
245  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
246  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
247  &musicmask_pix);
248  int result = 0;
249  if (finder != NULL) {
250  TO_BLOCK_IT to_block_it(&temp_blocks);
251  TO_BLOCK* to_block = to_block_it.data();
252  if (musicmask_pix != NULL) {
253  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
254  // blocks separately. For now combine with photomask_pix.
255  pixOr(photomask_pix, photomask_pix, musicmask_pix);
256  }
257  if (equ_detect_) {
258  finder->SetEquationDetect(equ_detect_);
259  }
260  result = finder->FindBlocks(
261  pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
262  pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
263  if (result >= 0)
264  finder->GetDeskewVectors(&deskew_, &reskew_);
265  delete finder;
266  }
267  pixDestroy(&photomask_pix);
268  pixDestroy(&musicmask_pix);
269  if (result < 0) return result;
270 
271  blocks->clear();
272  BLOCK_IT block_it(blocks);
273  // Move the found blocks to the input/output blocks.
274  block_it.add_list_after(&found_blocks);
275 
276  if (textord_debug_images) {
277  // The debug image is no longer needed so delete it.
278  unlink(AlignedBlob::textord_debug_pix().string());
279  }
280  return result;
281 }
bool textord_debug_images
Definition: alignedblob.cpp:33
static const STRING & textord_debug_pix()
Definition: alignedblob.h:112
bool textord_debug_printable
Definition: alignedblob.cpp:34
#define NULL
Definition: host.h:144
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 520 of file superscript.cpp.

524  {
525  int initial_ok_run_count = 0;
526  int ok_run_count = 0;
527  float worst_certainty = 0.0f;
528  const WERD_CHOICE &wc = *word.best_choice;
529 
530  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
531  for (int i = 0; i < wc.length(); i++) {
532  TBLOB *blob = word.rebuild_word->blobs[i];
533  UNICHAR_ID unichar_id = wc.unichar_id(i);
534  float char_certainty = wc.certainty(i);
535  bool bad_certainty = char_certainty < certainty_threshold;
536  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
537  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
538  BLOB_CHOICE *choice = word.GetBlobChoice(i);
539  if (choice && fontinfo_table.size() > 0) {
540  // Get better information from the specific choice, if available.
541  int font_id1 = choice->fontinfo_id();
542  bool font1_is_italic = font_id1 >= 0
543  ? fontinfo_table.get(font_id1).is_italic() : false;
544  int font_id2 = choice->fontinfo_id2();
545  is_italic = font1_is_italic &&
546  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
547  }
548 
549  float height_fraction = 1.0f;
550  float char_height = blob->bounding_box().height();
551  float normal_height = char_height;
552  if (wc.unicharset()->top_bottom_useful()) {
553  int min_bot, max_bot, min_top, max_top;
554  wc.unicharset()->get_top_bottom(unichar_id,
555  &min_bot, &max_bot,
556  &min_top, &max_top);
557  float hi_height = max_top - max_bot;
558  float lo_height = min_top - min_bot;
559  normal_height = (hi_height + lo_height) / 2;
560  if (normal_height >= kBlnXHeight) {
561  // Only ding characters that we have decent information for because
562  // they're supposed to be normal sized, not tiny specks or dashes.
563  height_fraction = char_height / normal_height;
564  }
565  }
566  bool bad_height = height_fraction < superscript_scaledown_ratio;
567 
568  if (debug) {
569  if (is_italic) {
570  tprintf(" Rejecting: superscript is italic.\n");
571  }
572  if (is_punc) {
573  tprintf(" Rejecting: punctuation present.\n");
574  }
575  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
576  if (bad_certainty) {
577  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
578  "which is less than threshold %.2f\n", char_str,
579  char_certainty, certainty_threshold);
580  }
581  if (bad_height) {
582  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
583  "expected %.2f\n", char_str, char_height, normal_height);
584  }
585  }
586  if (bad_certainty || bad_height || is_punc || is_italic) {
587  if (ok_run_count == i) {
588  initial_ok_run_count = ok_run_count;
589  }
590  ok_run_count = 0;
591  } else {
592  ok_run_count++;
593  }
594  if (char_certainty < worst_certainty) {
595  worst_certainty = char_certainty;
596  }
597  }
598  bool all_ok = ok_run_count == wc.length();
599  if (all_ok && debug) {
600  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
601  }
602  if (!all_ok) {
603  if (left_ok) *left_ok = initial_ok_run_count;
604  if (right_ok) *right_ok = ok_run_count;
605  }
606  return all_ok;
607 }
double superscript_scaledown_ratio
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const FontInfo * fontinfo
Definition: pageres.h:288
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
inT16 fontinfo_id() const
Definition: ratngs.h:85
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool is_italic() const
Definition: fontinfo.h:111
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
inT16 height() const
Definition: rect.h:104
const T & get(int id) const
Return the object from an id.
bool top_bottom_useful() const
Definition: unicharset.h:495
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
int size() const
Return the size used.
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
TBOX bounding_box() const
Definition: blobs.cpp:482
Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 212 of file tesseractclass.h.

212  {
213  return pix_grey_ != NULL ? pix_grey_ : pix_binary_;
214  }
#define NULL
Definition: host.h:144
void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES page_res)

Definition at line 442 of file control.cpp.

442  {
443  PAGE_RES_IT word_it(page_res);
444 
445  WERD_RES *w_prev = NULL;
446  WERD_RES *w = word_it.word();
447  while (1) {
448  w_prev = w;
449  while (word_it.forward() != NULL &&
450  (!word_it.word() || word_it.word()->part_of_combo)) {
451  // advance word_it, skipping over parts of combos
452  }
453  if (!word_it.word()) break;
454  w = word_it.word();
455  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
456  continue;
457  }
458  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
459  if (tessedit_bigram_debug) {
460  tprintf("Skipping because one of the words is W_REP_CHAR\n");
461  }
462  continue;
463  }
464  // Two words sharing the same language model, excellent!
465  GenericVector<WERD_CHOICE *> overrides_word1;
466  GenericVector<WERD_CHOICE *> overrides_word2;
467 
468  STRING orig_w1_str = w_prev->best_choice->unichar_string();
469  STRING orig_w2_str = w->best_choice->unichar_string();
470  WERD_CHOICE prev_best(w->uch_set);
471  {
472  int w1start, w1end;
473  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
474  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
475  }
476  WERD_CHOICE this_best(w->uch_set);
477  {
478  int w2start, w2end;
479  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
480  this_best = w->best_choice->shallow_copy(w2start, w2end);
481  }
482 
483  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
484  if (tessedit_bigram_debug) {
485  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
486  orig_w1_str.string(), orig_w2_str.string());
487  }
488  continue;
489  }
490  if (tessedit_bigram_debug > 2) {
491  tprintf("Examining alt choices for \"%s %s\".\n",
492  orig_w1_str.string(), orig_w2_str.string());
493  }
494  if (tessedit_bigram_debug > 1) {
495  if (!w_prev->best_choices.singleton()) {
496  w_prev->PrintBestChoices();
497  }
498  if (!w->best_choices.singleton()) {
499  w->PrintBestChoices();
500  }
501  }
502  float best_rating = 0.0;
503  int best_idx = 0;
504  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
505  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
506  WERD_CHOICE *p1 = prev_it.data();
507  WERD_CHOICE strip1(w->uch_set);
508  {
509  int p1start, p1end;
510  p1->GetNonSuperscriptSpan(&p1start, &p1end);
511  strip1 = p1->shallow_copy(p1start, p1end);
512  }
513  WERD_CHOICE_IT w_it(&w->best_choices);
514  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
515  WERD_CHOICE *p2 = w_it.data();
516  WERD_CHOICE strip2(w->uch_set);
517  {
518  int p2start, p2end;
519  p2->GetNonSuperscriptSpan(&p2start, &p2end);
520  strip2 = p2->shallow_copy(p2start, p2end);
521  }
522  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
523  overrides_word1.push_back(p1);
524  overrides_word2.push_back(p2);
525  if (overrides_word1.size() == 1 ||
526  p1->rating() + p2->rating() < best_rating) {
527  best_rating = p1->rating() + p2->rating();
528  best_idx = overrides_word1.size() - 1;
529  }
530  }
531  }
532  }
533  if (overrides_word1.size() >= 1) {
534  // Excellent, we have some bigram matches.
536  *overrides_word1[best_idx]) &&
538  *overrides_word2[best_idx])) {
539  if (tessedit_bigram_debug > 1) {
540  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
541  "model.\n", orig_w1_str.string(), orig_w2_str.string());
542  }
543  continue;
544  }
545  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
546  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
547  if (new_w1_str != orig_w1_str) {
548  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
549  }
550  if (new_w2_str != orig_w2_str) {
551  w->ReplaceBestChoice(overrides_word2[best_idx]);
552  }
553  if (tessedit_bigram_debug > 0) {
554  STRING choices_description;
555  int num_bigram_choices
556  = overrides_word1.size() * overrides_word2.size();
557  if (num_bigram_choices == 1) {
558  choices_description = "This was the unique bigram choice.";
559  } else {
560  if (tessedit_bigram_debug > 1) {
561  STRING bigrams_list;
562  const int kMaxChoicesToPrint = 20;
563  for (int i = 0; i < overrides_word1.size() &&
564  i < kMaxChoicesToPrint; i++) {
565  if (i > 0) { bigrams_list += ", "; }
566  WERD_CHOICE *p1 = overrides_word1[i];
567  WERD_CHOICE *p2 = overrides_word2[i];
568  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
569  if (i == kMaxChoicesToPrint) {
570  bigrams_list += " ...";
571  }
572  }
573  choices_description = "There were many choices: {";
574  choices_description += bigrams_list;
575  choices_description += "}";
576  } else {
577  choices_description.add_str_int("There were ", num_bigram_choices);
578  choices_description += " compatible bigrams.";
579  }
580  }
581  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
582  orig_w1_str.string(), orig_w2_str.string(),
583  new_w1_str.string(), new_w2_str.string(),
584  choices_description.string());
585  }
586  }
587  }
588 }
int size() const
Definition: genericvector.h:72
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
float rating() const
Definition: ratngs.h:324
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392
const STRING & unichar_string() const
Definition: ratngs.h:524
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375
const UNICHARSET * uch_set
Definition: pageres.h:192
tesseract::Tesseract * tesseract
Definition: pageres.h:266
Dict & getDict()
Definition: classify.h:65
void PrintBestChoices() const
Definition: pageres.cpp:709
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738
WERD * word
Definition: pageres.h:175
void add_str_int(const char *str, int number)
Definition: strngs.cpp:376
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791
void tesseract::Tesseract::blamer_pass ( PAGE_RES *  page_res)

Definition at line 686 of file control.cpp.

// Blamer reporting pass. Does nothing unless wordrec_run_blamer is set.
// Otherwise it visits every word of page_res, then prints the per-reason
// blame counters and, if there are any, the entries of the misadaption log.
// NOTE(review): listing lines 692-693 and 697 are absent from this rendering
// (the numbering jumps), so the per-word work inside the first loop and the
// start of the tprintf call in the second loop are not shown here.
686  {
687  if (!wordrec_run_blamer) return;
688  PAGE_RES_IT page_res_it(page_res);
// Walk all words on the page from the start.
689  for (page_res_it.restart_page(); page_res_it.word() != NULL;
690  page_res_it.forward()) {
691  WERD_RES *word = page_res_it.word();
694  }
695  tprintf("Blame reasons:\n");
// One output entry per IncorrectResultReason value.
696  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
698  static_cast<IncorrectResultReason>(bl)),
699  page_res->blame_reasons[bl]);
700  }
// The misadaption log is only printed when it has at least one entry.
701  if (page_res->misadaption_log.length() > 0) {
702  tprintf("Misadaption log:\n");
703  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
704  tprintf("%s\n", page_res->misadaption_log[i].string());
705  }
706  }
707 }
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
GenericVector< int > blame_reasons
Definition: pageres.h:68
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool wordrec_run_blamer
Definition: wordrec.h:168
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void tesseract::Tesseract::blob_feature_display ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

Definition at line 960 of file pgedit.cpp.

// Debug display: makes a pseudo word from page_res and selection_box, sets
// it up for recognition, extracts integer features from its first blob and
// renders them in two ScrollView windows ("BL Features" and "CN Features").
// The pseudo word and its iterator are deleted again before returning.
// NOTE(review): listing lines 967-970, 975-976, 981 and 987 are absent from
// this rendering (the numbering jumps), so part of the SetupForRecognition
// argument list, the declarations of bl_features/cn_features and one
// statement before each render loop are not shown here.
961  {
962  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
// Nothing is displayed when no pseudo word was produced.
963  if (it != NULL) {
964  WERD_RES* word_res = it->word();
// Take the x-height from the row that holds the pseudo word.
965  word_res->x_height = it->row()->row->x_height();
966  word_res->SetupForRecognition(unicharset, this, BestPix(),
971  it->row()->row, it->block()->block);
972  TWERD* bln_word = word_res->chopped_word;
// Only the first blob is examined.
// NOTE(review): assumes chopped_word has at least one blob - confirm.
973  TBLOB* bln_blob = bln_word->blobs[0];
974  INT_FX_RESULT_STRUCT fx_info;
977  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
978  &cn_features, &fx_info, NULL);
979  // Display baseline features.
980  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
982  for (int f = 0; f < bl_features.size(); ++f)
983  RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
984  bl_win->Update();
985  // Display cn features.
986  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
988  for (int f = 0; f < cn_features.size(); ++f)
989  RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
990  cn_win->Update();
991 
// Remove the temporary pseudo word and free the iterator.
992  it->DeleteCurrentWord();
993  delete it;
994  }
995 }
Definition: blobs.h:261
int size() const
Definition: genericvector.h:72
bool classify_bln_numeric_mode
Definition: classify.h:500
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:1104
TWERD * chopped_word
Definition: pageres.h:201
static void Update()
Definition: scrollview.cpp:715
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
UNICHARSET unicharset
Definition: ccutil.h:72
float x_height() const
Definition: ocrrow.h:61
void DeleteCurrentWord()
Definition: pageres.cpp:1449
float x_height
Definition: pageres.h:295
BLOCK * block
Definition: pageres.h:99
BLOCK_RES * block() const
Definition: pageres.h:739
bool classify_nonlinear_norm
Definition: classify.h:416
ROW_RES * row() const
Definition: pageres.h:736
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1770
Pix * BestPix() const
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
ROW * row
Definition: pageres.h:127
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define NULL
Definition: host.h:144
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1936
Definition: blobs.h:395
WERD_RES * word() const
Definition: pageres.h:733
float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 761 of file fixspace.cpp.

// Returns a size-based score for blob: the largest width-or-height found
// among the bounding boxes of its outlines, doubled when the blob has more
// than five outlines, and then halved when the blob's own bounding box lies
// wholly above 4 * kBlnBaselineOffset or wholly below kBlnBaselineOffset / 2.
// The arithmetic is done in inT16 and the result is converted to float on
// return.
761  {
762  TBOX box; // BB of outline
763  inT16 outline_count = 0;
764  inT16 max_dimension;
765  inT16 largest_outline_dimension = 0;
766 
// Count the outlines and track the longest side of any outline's box.
767  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
768  outline_count++;
769  box = ol->bounding_box();
770  if (box.height() > box.width()) {
771  max_dimension = box.height();
772  } else {
773  max_dimension = box.width();
774  }
775 
776  if (largest_outline_dimension < max_dimension)
777  largest_outline_dimension = max_dimension;
778  }
779 
780  if (outline_count > 5) {
781  // penalise LOTS of blobs (the count tested here is the number of outlines)
782  largest_outline_dimension *= 2;
783  }
784 
// From here on, box is the bounding box of the whole blob.
785  box = blob->bounding_box();
786  if (box.bottom() > kBlnBaselineOffset * 4 ||
787  box.top() < kBlnBaselineOffset / 2) {
788  // Lax blob is if high or low: halve the score (integer division).
789  largest_outline_dimension /= 2;
790  }
791 
792  return largest_outline_dimension;
793 }
const int kBlnBaselineOffset
Definition: normalis.h:29
inT16 bottom() const
Definition: rect.h:61
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
#define NULL
Definition: host.h:144
TBOX bounding_box() const
Definition: blobs.cpp:482
TESSLINE * outlines
Definition: blobs.h:377
inT16 top() const
Definition: rect.h:54
short inT16
Definition: host.h:100
void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 616 of file fixspace.cpp.

// Finds, over all words in the list, the blob that worst_noise_blob() rates
// with the lowest noise score, deletes that blob and splits its word in two
// at that position. The blobs before the deleted one go into a new word that
// is inserted in front of the original; the original keeps the rest.
// If no word reports a candidate blob, the list is cleared to tell the
// caller to stop.
616  {
617  WERD_RES_IT word_it(&words);
618  WERD_RES_IT worst_word_it;
// Start value for the minimum search below; only lower scores replace it.
619  float worst_noise_score = 9999;
620  int worst_blob_index = -1; // Noisiest blob of noisiest wd
621  int blob_index; // of wds noisiest blob
622  float noise_score; // of wds noisiest blob
623  WERD_RES *word_res;
624  C_BLOB_IT blob_it;
625  C_BLOB_IT rej_cblob_it;
626  C_BLOB_LIST new_blob_list;
627  C_BLOB_IT new_blob_it;
628  C_BLOB_IT new_rej_cblob_it;
629  WERD *new_word;
630  inT16 start_of_noise_blob;
631  inT16 i;
632 
// Keep the word whose candidate blob has the lowest score; a blob index
// below 0 means the word offered no candidate.
633  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
634  blob_index = worst_noise_blob(word_it.data(), &noise_score);
635  if (blob_index > -1 && worst_noise_score > noise_score) {
636  worst_noise_score = noise_score;
637  worst_blob_index = blob_index;
638  worst_word_it = word_it;
639  }
640  }
641  if (worst_blob_index < 0) {
642  words.clear(); // signal termination
643  return;
644  }
645 
646  /* Now split the worst_word_it */
647 
648  word_res = worst_word_it.data();
649 
650  /* Move blobs before noise blob to a new bloblist */
651 
652  new_blob_it.set_to_list(&new_blob_list);
653  blob_it.set_to_list(word_res->word->cblob_list());
654  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
655  new_blob_it.add_after_then_move(blob_it.extract());
656  }
// Remember the left edge of the noise blob; it is the split position used
// for the rejected blobs further down.
657  start_of_noise_blob = blob_it.data()->bounding_box().left();
658  delete blob_it.extract(); // throw out noise blob
659 
// Build the leading word from the moved blobs, passing the original word as
// the second constructor argument.
660  new_word = new WERD(&new_blob_list, word_res->word);
// The new word now precedes the remainder: clear end-of-line on the new
// word and beginning-of-line on the remainder.
661  new_word->set_flag(W_EOL, FALSE);
662  word_res->word->set_flag(W_BOL, FALSE);
663  word_res->word->set_blanks(1); // After break
664 
// Move rejected blobs to the new word for as long as the one at the
// iterator starts left of the deleted noise blob.
665  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
666  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
667  for (;
668  (!rej_cblob_it.empty() &&
669  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
670  rej_cblob_it.forward()) {
671  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
672  }
673 
// Wrap the new word in a result, flag it as a combination, and insert it
// in front of the word that was split.
674  WERD_RES* new_word_res = new WERD_RES(new_word);
675  new_word_res->combination = TRUE;
676  worst_word_it.add_before_then_move(new_word_res);
677 
// The original word lost blobs, so its previous results are cleared.
678  word_res->ClearResults();
679 }
void ClearResults()
Definition: pageres.cpp:1140
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681
Definition: werd.h:35
BOOL8 combination
Definition: pageres.h:315
Definition: werd.h:36
Definition: werd.h:60
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
short inT16
Definition: host.h:100
SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 257 of file pgedit.cpp.

// Builds the editor menu tree and returns its root node. The root gets three
// submenus, "MODES", "DISPLAY" and "OTHER", each filled with entries that
// pair a label with a command event id.
// NOTE(review): the root is allocated with new and returned as a raw
// pointer; presumably the caller takes ownership - confirm.
// NOTE(review): some entries pass a third FALSE/TRUE argument; presumably
// that makes them toggle items with the given initial state - confirm
// against the SVMenuNode::AddChild overloads.
257  {
258  SVMenuNode* parent_menu;
259  SVMenuNode* root_menu_item = new SVMenuNode();
260 
261  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
262 
263  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
264  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
265  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
266  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
267  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
268  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
269  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
270  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
271 
// parent_menu is reused: first for the DISPLAY submenu, then for OTHER.
272  parent_menu = root_menu_item->AddChild("DISPLAY");
273 
274  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
275  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
276  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
277  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
278  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
279  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
280  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
281  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
282  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
283  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
284  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
285  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
286  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
287  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
288  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
289 
290 
291  parent_menu = root_menu_item->AddChild("OTHER");
292 
293  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
294  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
295  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
296  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
297  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
298  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
299 
300  return root_menu_item;
301 }
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:59
BOOL8 tesseract::Tesseract::check_debug_pt ( WERD_RES *  word,
int  location 
)

Definition at line 1767 of file control.cpp.

// Debug hook for a single test point. Returns TRUE when test_pt is set and
// the bounding box of word contains (test_pt_x, test_pt_y); FALSE otherwise.
//
// Side effects: whenever test_pt is set, tessedit_rejection_debug and
// debug_x_ht_level are first reset (FALSE / 0). On a hit with location >= 0
// they are switched on (TRUE / 2) and a report is printed: a label chosen by
// location, the best choice with its reject map, and the tess_accepted and
// done flags. Locations 50 and 120 additionally print the reject map entry
// of every character. A negative location returns TRUE on a hit without
// printing or enabling anything.
1767  {
1768  BOOL8 show_map_detail = FALSE;
1769  inT16 i;
1770 
1771  if (!test_pt)
1772  return FALSE;
1773 
// Default to debugging off; it is re-enabled below only for the test word.
1774  tessedit_rejection_debug.set_value (FALSE);
1775  debug_x_ht_level.set_value(0);
1776 
1777  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1778  if (location < 0)
1779  return TRUE; // For breakpoint use
1780  tessedit_rejection_debug.set_value (TRUE);
1781  debug_x_ht_level.set_value(2);
1782  tprintf ("\n\nTESTWD::");
// Label the report by call site. There is no default case, so a location
// that is not listed prints no label.
1783  switch (location) {
1784  case 0:
1785  tprintf ("classify_word_pass1 start\n");
1786  word->word->print();
1787  break;
1788  case 10:
1789  tprintf ("make_reject_map: initial map");
1790  break;
1791  case 20:
1792  tprintf ("make_reject_map: after NN");
1793  break;
1794  case 30:
1795  tprintf ("classify_word_pass2 - START");
1796  break;
1797  case 40:
1798  tprintf ("classify_word_pass2 - Pre Xht");
1799  break;
1800  case 50:
1801  tprintf ("classify_word_pass2 - END");
1802  show_map_detail = TRUE;
1803  break;
1804  case 60:
1805  tprintf ("fixspace");
1806  break;
1807  case 70:
1808  tprintf ("MM pass START");
1809  break;
1810  case 80:
1811  tprintf ("MM pass END");
1812  break;
1813  case 90:
1814  tprintf ("After Poor quality rejection");
1815  break;
1816  case 100:
1817  tprintf ("unrej_good_quality_words - START");
1818  break;
1819  case 110:
1820  tprintf ("unrej_good_quality_words - END");
1821  break;
1822  case 120:
1823  tprintf ("Write results pass");
1824  show_map_detail = TRUE;
1825  break;
1826  }
1827  if (word->best_choice != NULL) {
1828  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1829  word->reject_map.print(debug_fp);
1830  tprintf("\n");
1831  if (show_map_detail) {
1832  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
// NOTE(review): i steps through the chars of the string and is also used
// as the reject_map index; presumably these only line up when every
// character is a single byte - confirm.
1833  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1834  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1835  word->reject_map[i].full_print(debug_fp);
1836  }
1837  }
1838  } else {
1839  tprintf("null best choice\n");
1840  }
1841  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1842  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1843  return TRUE;
1844  } else {
1845  return FALSE;
1846  }
1847 }
BOOL8 tess_accepted
Definition: pageres.h:280
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
const STRING & unichar_string() const
Definition: ratngs.h:524
void full_print(FILE *fp)
Definition: rejctmap.cpp:406
void print()
Definition: werd.cpp:266
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
FILE * debug_fp
Definition: tessvars.cpp:24
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
void print(FILE *fp)
Definition: rejctmap.cpp:394
const char * string() const
Definition: strngs.cpp:193
Definition: points.h:189
short inT16
Definition: host.h:100
void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT *  pr_it,
WordData *  word_data 
)

Definition at line 1268 of file control.cpp.

// Recognizes one word, trying the most recently used language first and the
// remaining languages only while the results are not acceptable. The
// language that produced the kept result becomes most_recently_used_, and
// the best results are moved into word_data->word or, via pr_it, into the
// page results.
// NOTE(review): listing lines 1271 and 1277 are absent from this rendering
// (the numbering jumps), so the second branch of the recognizer selection
// and the condition guarding the first tprintf are not shown here.
1269  {
1270  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1272  // Best result so far.
1273  PointerVector<WERD_RES> best_words;
1274  // Points to the best result. May be word or in lang_words.
1275  WERD_RES* word = word_data->word;
1276  clock_t start_t = clock();
1278  tprintf("%s word with lang %s at:",
1279  word->done ? "Already done" : "Processing",
1280  most_recently_used_->lang.string());
1281  word->word->bounding_box().print();
1282  }
1283  if (word->done) {
1284  // If done on pass1, leave it as-is.
// Unless recognition failed, remember which language handled the word.
1285  if (!word->tess_failed)
1286  most_recently_used_ = word->tesseract;
1287  return;
1288  }
// sub is the lang_words slot of most_recently_used_. The slot at index
// sub_langs_.size() is the one used for this object itself (see the
// this->RetryWithLanguage call below).
1289  int sub = sub_langs_.size();
1290  if (most_recently_used_ != this) {
1291  // Get the index of the most_recently_used_.
1292  for (sub = 0; sub < sub_langs_.size() &&
1293  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1294  }
1295  most_recently_used_->RetryWithLanguage(
1296  *word_data, recognizer, &word_data->lang_words[sub], &best_words);
1297  Tesseract* best_lang_tess = most_recently_used_;
1298  if (!WordsAcceptable(best_words)) {
1299  // Try all the other languages to see if they are any better.
// A language replaces best_lang_tess only when its RetryWithLanguage call
// returns a value greater than 0.
1300  if (most_recently_used_ != this &&
1301  this->RetryWithLanguage(*word_data, recognizer,
1302  &word_data->lang_words[sub_langs_.size()],
1303  &best_words) > 0) {
1304  best_lang_tess = this;
1305  }
// Stop trying sub-languages as soon as the results become acceptable.
1306  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1307  ++i) {
1308  if (most_recently_used_ != sub_langs_[i] &&
1309  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
1310  &word_data->lang_words[i],
1311  &best_words) > 0) {
1312  best_lang_tess = sub_langs_[i];
1313  }
1314  }
1315  }
1316  most_recently_used_ = best_lang_tess;
1317  if (!best_words.empty()) {
1318  if (best_words.size() == 1 && !best_words[0]->combination) {
1319  // Move the best single result to the main word.
1320  word_data->word->ConsumeWordResults(best_words[0]);
1321  } else {
1322  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1323  word_data->word = best_words.back();
1324  pr_it->ReplaceCurrentWord(&best_words);
1325  }
1326  ASSERT_HOST(word_data->word->box_word != NULL);
1327  } else {
1328  tprintf("no best words!!\n");
1329  }
1330  clock_t ocr_t = clock();
// NOTE(review): word was captured on entry. After the ReplaceCurrentWord
// branch above, word_data->word points elsewhere and the original word may
// no longer be valid; best_choice may also be unset when there were no best
// words. Verify that dereferencing word here is safe.
1331  if (tessedit_timing_debug) {
1332  tprintf("%s (ocr took %.2f sec)\n",
1333  word->best_choice->unichar_string().string(),
1334  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1335  }
1336 }
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:869
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1488
WERD_CHOICE * best_choice
Definition: pageres.h:219
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1321
#define tprintf(...)
Definition: tprintf.h:31
void print() const
Definition: rect.h:270
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1344
tesseract::Tesseract * tesseract
Definition: pageres.h:266
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
BOOL8 tess_failed
Definition: pageres.h:272
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
STRING lang
Definition: ccutil.h:69
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Tesseract::classify_word_pass1 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1344 of file control.cpp.

// Pass 1 recognition of *in_word: matches the word with
// match_word_pass_n(1, ...) and, when word_adaptable() allows it, feeds the
// result to the adaptive classifier through LearnWord.
// prev_word_best_choice_ is set from word_data.prev_word beforehand (NULL
// when there is no previous word).
// NOTE(review): listing lines 1353, 1370-1371 and 1376 are absent from this
// rendering (the numbering jumps), so the condition guarding the cube-only
// path, the body of the blamer branch and the statement controlled by the
// tessedit_enable_doc_dict test are not shown here.
1346  {
1347  ROW* row = word_data.row;
1348  BLOCK* block = word_data.block;
1349  prev_word_best_choice_ = word_data.prev_word != NULL
1350  ? word_data.prev_word->word->best_choice : NULL;
1351 #ifndef ANDROID_BUILD
1352  // If we only intend to run cube - run it and return.
1354  cube_word_pass1(block, row, *in_word);
1355  return;
1356  }
1357 #endif
1358  WERD_RES* word = *in_word;
1359  match_word_pass_n(1, word, row, block);
// Adaption is skipped for failed words and for words flagged W_REP_CHAR.
1360  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1361  word->tess_would_adapt = AdaptableWord(word);
1362  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1363 
1364  if (adapt_ok) {
1365  // Send word to adaptive classifier for training.
1366  word->BestChoiceToCorrectText();
1367  LearnWord(NULL, word);
1368  // Mark misadaptions if running blamer.
1369  if (word->blamer_bundle != NULL) {
1372  }
1373  }
1374 
1375  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1377  }
1378 }
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
BOOL8 tess_would_adapt
Definition: pageres.h:281
Definition: ocrrow.h:32
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
bool wordrec_debug_blamer
Definition: wordrec.h:167
Definition: ocrblock.h:30
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
WERD * word
Definition: pageres.h:175
BOOL8 tess_failed
Definition: pageres.h:272
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850
#define NULL
Definition: host.h:144
bool IsAmbiguous()
Definition: pageres.cpp:443
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
void BestChoiceToCorrectText()
Definition: pageres.cpp:917
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
void tesseract::Tesseract::classify_word_pass2 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1488 of file control.cpp.

// Pass 2 handling of *in_word: unless the word is already done it is
// re-matched with match_word_pass_n(2, ...); then SubAndSuperscriptFix is
// applied and, for words that neither failed nor carry W_REP_CHAR, possibly
// TrainedXheightFix. check_debug_pt is called at locations 30, 40 and 50.
// NOTE(review): listing lines 1492-1493, 1496, 1505, 1518, 1524, 1527, 1535
// and 1538 are absent from this rendering (the numbering jumps), so several
// conditions and statements, including the ones guarding the two early
// returns and the graphics block, are not shown here.
1490  {
1491  // Return if we do not want to run Tesseract.
1494  word_data.word->best_choice != NULL)
1495  return;
1497  return;
1498  }
1499  ROW* row = word_data.row;
1500  BLOCK* block = word_data.block;
1501  WERD_RES* word = *in_word;
1502  prev_word_best_choice_ = word_data.prev_word != NULL
1503  ? word_data.prev_word->word->best_choice : NULL;
1504 
1506  check_debug_pt(word, 30);
1507  if (!word->done) {
// Reset caps_height and fall back to the row's x-height when the word has
// none of its own, then run the pass 2 match.
1508  word->caps_height = 0.0;
1509  if (word->x_height == 0.0f)
1510  word->x_height = row->x_height();
1511  match_word_pass_n(2, word, row, block);
1512  check_debug_pt(word, 40);
1513  }
1514 
1515  SubAndSuperscriptFix(word);
1516 
1517  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1519  block->classify_rotation().y() == 0.0f) {
1520  // Use the tops and bottoms since they are available.
1521  TrainedXheightFix(word, block, row);
1522  }
1523 
1525  }
1526 #ifndef GRAPHICS_DISABLED
// Graphics builds only: plot the rebuilt word in the fx window, creating
// the window on first use, and zoom to the word's bounding box.
1528  if (fx_win == NULL)
1529  create_fx_win();
1530  clear_fx_win();
1531  word->rebuild_word->plot(fx_win);
1532  TBOX wbox = word->rebuild_word->bounding_box();
1533  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1534  wbox.right(), wbox.bottom());
1536  }
1537 #endif
1539  check_debug_pt(word, 50);
1540 }
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1402
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549
bool SubAndSuperscriptFix(WERD_RES *word_res)
static void Update()
Definition: scrollview.cpp:715
UNICHARSET unicharset
Definition: ccutil.h:72
void clear_fx_win()
Definition: drawfx.cpp:73
float caps_height
Definition: pageres.h:296
float x_height() const
Definition: ocrrow.h:61
inT16 right() const
Definition: rect.h:75
float x_height
Definition: pageres.h:295
Definition: ocrrow.h:32
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
void plot(ScrollView *window)
Definition: blobs.cpp:918
FCOORD classify_rotation() const
Definition: ocrblock.h:144
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT16 left() const
Definition: rect.h:68
TWERD * rebuild_word
Definition: pageres.h:244
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
bool script_has_xheight() const
Definition: unicharset.h:849
Definition: ocrblock.h:30
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:881
BOOL8 done
Definition: pageres.h:282
WERD * word
Definition: pageres.h:175
bool top_bottom_useful() const
Definition: unicharset.h:495
BOOL8 tess_failed
Definition: pageres.h:272
Definition: rect.h:30
float y() const
Definition: points.h:212
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
#define SUBLOC_NORM
Definition: errcode.h:59
void create_fx_win()
Definition: drawfx.cpp:60
float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str,
float *  c2 
)

Definition at line 1232 of file control.cpp.

// Classifies a single blob as if it were a word of its own.
// A temporary word is built from a deep copy of blob, inserted into the page
// results as a clone of the current word of pr_it, recognized, and deleted
// again before returning.
//   pass_n    forwarded to classify_word_and_language (the word setup itself
//             always uses pass 1).
//   pr_it     iterator on the word the blob belongs to; its word iterator is
//             reset before returning.
//   blob      the blob to classify; only a deep copy of it is used.
//   best_str  receives the unichar string of the raw choice.
//   c2        receives certainty * certainty / rating of the raw choice, or 0
//             when the rating is not positive.
// Returns the certainty of the raw choice.
1233  {
1234  WERD* real_word = pr_it->word()->word;
// The temporary word inherits the BOL/EOL flags of the real word.
1235  WERD* word = real_word->ConstructFromSingleBlob(
1236  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1237  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1238  // Get a new iterator that points to the new word.
1239  PAGE_RES_IT it(pr_it->page_res);
1240  while (it.word() != word_res && it.word() != NULL) it.forward();
// The inserted word must be reachable from a fresh iterator.
1241  ASSERT_HOST(it.word() == word_res);
1242  WordData wd(it);
1243  // Force full initialization.
1244  SetupWordPassN(1, &wd);
1245  classify_word_and_language(pass_n, &it, &wd);
1246  if (debug_noise_removal) {
1247  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1248  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1249  wd.word->raw_choice->max_x_height());
1250  }
1251  float cert = wd.word->raw_choice->certainty();
1252  float rat = wd.word->raw_choice->rating();
// Guard against dividing by a zero or negative rating.
1253  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1254  *best_str = wd.word->raw_choice->unichar_string();
// Remove the temporary word again and resynchronize the caller's iterator.
1255  it.DeleteCurrentWord();
1256  pr_it->ResetWordIterator();
1257  return cert;
1258 }
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
void ResetWordIterator()
Definition: pageres.cpp:1532
#define tprintf(...)
Definition: tprintf.h:31
PAGE_RES * page_res
Definition: pageres.h:658
float x_height
Definition: pageres.h:295
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: werd.h:35
Definition: werd.h:36
Definition: werd.h:60
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
WERD * word
Definition: pageres.h:175
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1268
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:733
float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str 
)

Definition at line 1190 of file control.cpp.

// Classifies blob together with the outlines selected by ok_outlines, then
// undoes the temporary change so blob ends with the outlines it started with.
//   ok_outlines  one flag per entry of outlines; true selects that outline.
//   outlines     candidate outlines; they are only lent to the blob and are
//                extracted again afterwards.
//   blob         blob to extend, or NULL to build a temporary blob from the
//                selected outlines alone.
//   pass_n, pr_it, best_str  forwarded to ClassifyBlobAsWord.
// Returns the certainty from ClassifyBlobAsWord when blob was supplied, and
// the negated c2 value when the blob was created here.
// NOTE(review): when blob is NULL and no flag in ok_outlines is set, blob is
// still NULL when passed to ClassifyBlobAsWord and ol_it was never attached
// to a list - verify that callers never do this.
1193  {
1194  C_OUTLINE_IT ol_it;
// Remembers the blob's first original outline; it stays NULL when blob is
// NULL on entry.
1195  C_OUTLINE* first_to_keep = NULL;
1196  if (blob != NULL) {
1197  // Add the required outlines to the blob.
1198  ol_it.set_to_list(blob->out_list());
1199  first_to_keep = ol_it.data();
1200  }
1201  for (int i = 0; i < ok_outlines.size(); ++i) {
1202  if (ok_outlines[i]) {
1203  // This outline is to be added.
1204  if (blob == NULL) {
// The first selected outline creates the temporary blob.
1205  blob = new C_BLOB(outlines[i]);
1206  ol_it.set_to_list(blob->out_list());
1207  } else {
// Added ahead of the iterator, which stays put, so with an existing blob
// the added outlines end up in front of first_to_keep.
1208  ol_it.add_before_stay_put(outlines[i]);
1209  }
1210  }
1211  }
1212  float c2;
1213  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1214  ol_it.move_to_first();
1215  if (first_to_keep == NULL) {
1216  // We created blob. Empty its outlines and delete it.
// The outlines are extracted first; they came from the outlines argument.
1217  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1218  delete blob;
1219  cert = -c2;
1220  } else {
1221  // Remove the outlines that we put in.
1222  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1223  ol_it.extract();
1224  }
1225  }
1226  return cert;
1227 }
int size() const
Definition: genericvector.h:72
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1232
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
#define NULL
Definition: host.h:144
void tesseract::Tesseract::Clear ( )

Definition at line 640 of file tesseractclass.cpp.

// Resets the per-image state of this object: destroys the held Pix images,
// sets deskew_ and reskew_ back to FCOORD(1.0f, 0.0f), clears splitter_, sets
// scaled_factor_ to -1, and does the same for every sub-language.
640  {
641  pixDestroy(&pix_binary_);
642  pixDestroy(&cube_binary_);
643  pixDestroy(&pix_grey_);
644  pixDestroy(&pix_thresholds_);
645  pixDestroy(&scaled_color_);
646  deskew_ = FCOORD(1.0f, 0.0f);
647  reskew_ = FCOORD(1.0f, 0.0f);
648  splitter_.Clear();
649  scaled_factor_ = -1;
// Propagate the reset to each sub-language instance.
650  for (int i = 0; i < sub_langs_.size(); ++i)
651  sub_langs_[i]->Clear();
652 }
Definition: points.h:189
float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES *  word_res,
float *  baseline_shift 
)

Definition at line 101 of file fixxht.cpp.

102  {
103  STATS top_stats(0, MAX_UINT8);
104  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
105  int bottom_shift = 0;
106  int num_blobs = word_res->rebuild_word->NumBlobs();
107  do {
108  top_stats.clear();
109  shift_stats.clear();
110  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
111  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
112  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
113  if (unicharset.get_isalpha(class_id) ||
114  unicharset.get_isdigit(class_id)) {
115  int top = blob->bounding_box().top() + bottom_shift;
116  // Clip the top to the limit of normalized feature space.
117  if (top >= INT_FEAT_RANGE)
118  top = INT_FEAT_RANGE - 1;
119  int bottom = blob->bounding_box().bottom() + bottom_shift;
120  int min_bottom, max_bottom, min_top, max_top;
121  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
122  &min_top, &max_top);
123  // Chars with a wild top range would mess up the result so ignore them.
124  if (max_top - min_top > kMaxCharTopRange)
125  continue;
126  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
127  top - (max_top + x_ht_acceptance_tolerance));
128  int height = top - kBlnBaselineOffset;
129  if (debug_x_ht_level >= 2) {
130  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
131  unicharset.id_to_unichar(class_id),
132  height, min_bottom, max_bottom, min_top, max_top,
133  bottom, top);
134  }
135  // Use only chars that fit in the expected bottom range, and where
136  // the range of tops is sensibly near the xheight.
137  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
138  bottom - x_ht_acceptance_tolerance <= max_bottom &&
139  min_top > kBlnBaselineOffset &&
140  max_top - kBlnBaselineOffset >= kBlnXHeight &&
141  misfit_dist > 0) {
142  // Compute the x-height position using proportionality between the
143  // actual height and expected height.
144  int min_xht = DivRounded(height * kBlnXHeight,
145  max_top - kBlnBaselineOffset);
146  int max_xht = DivRounded(height * kBlnXHeight,
147  min_top - kBlnBaselineOffset);
148  if (debug_x_ht_level >= 2) {
149  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
150  }
151  // The range of expected heights gets a vote equal to the distance
152  // of the actual top from the expected top.
153  for (int y = min_xht; y <= max_xht; ++y)
154  top_stats.add(y, misfit_dist);
155  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
156  bottom - x_ht_acceptance_tolerance > max_bottom) &&
157  bottom_shift == 0) {
158  // Get the range of required bottom shift.
159  int min_shift = min_bottom - bottom;
160  int max_shift = max_bottom - bottom;
161  if (debug_x_ht_level >= 2) {
162  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
163  }
164  // The range of expected shifts gets a vote equal to the min distance
165  // of the actual bottom from the expected bottom, spread over the
166  // range of its acceptance.
167  int misfit_weight = abs(min_shift);
168  if (max_shift > min_shift)
169  misfit_weight /= max_shift - min_shift;
170  for (int y = min_shift; y <= max_shift; ++y)
171  shift_stats.add(y, misfit_weight);
172  } else {
173  if (bottom_shift == 0) {
174  // Things with bottoms that are already ok need to say so, on the
175  // 1st iteration only.
176  shift_stats.add(0, kBlnBaselineOffset);
177  }
178  if (debug_x_ht_level >= 2) {
179  tprintf(" already OK\n");
180  }
181  }
182  }
183  }
184  if (shift_stats.get_total() > top_stats.get_total()) {
185  bottom_shift = IntCastRounded(shift_stats.median());
186  if (debug_x_ht_level >= 2) {
187  tprintf("Applying bottom shift=%d\n", bottom_shift);
188  }
189  }
190  } while (bottom_shift != 0 &&
191  top_stats.get_total() < shift_stats.get_total());
192  // Baseline shift is opposite sign to the bottom shift.
193  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194  if (debug_x_ht_level >= 2) {
195  tprintf("baseline shift=%g\n", *baseline_shift);
196  }
197  if (top_stats.get_total() == 0)
198  return bottom_shift != 0 ? word_res->x_height : 0.0f;
199  // The new xheight is just the median vote, which is then scaled out
200  // of BLN space back to pixel space to get the x-height in pixel space.
201  float new_xht = top_stats.median();
202  if (debug_x_ht_level >= 2) {
203  tprintf("Median xht=%f\n", new_xht);
204  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
205  new_xht, new_xht / word_res->denorm.y_scale());
206  }
207  // The xheight must change by at least x_ht_min_change to be used.
208  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
209  return new_xht / word_res->denorm.y_scale();
210  else
211  return bottom_shift != 0 ? word_res->x_height : 0.0f;
212 }
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
#define MAX(x, y)
Definition: ndminx.h:24
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
Definition: statistc.h:33
UNICHARSET unicharset
Definition: ccutil.h:72
float x_height
Definition: pageres.h:295
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
DENORM denorm
Definition: pageres.h:190
const int kBlnBaselineOffset
Definition: normalis.h:29
#define MAX_UINT8
Definition: host.h:121
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
inT16 bottom() const
Definition: rect.h:61
int DivRounded(int a, int b)
Definition: helpers.h:166
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int kMaxCharTopRange
Definition: fixxht.cpp:66
int IntCastRounded(double x)
Definition: helpers.h:172
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
float y_scale() const
Definition: normalis.h:272
#define INT_FEAT_RANGE
Definition: float2int.h:27
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 663 of file docqual.cpp.

663  {
664  int i;
665  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
666  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
667  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
668  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
669  for (i = 0; i < word_res->reject_map.length(); ++i) {
670  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
671  word_res->best_choice->set_unichar_id(unichar_dash, i);
672  if (word_res->reject_map[i].accepted ())
673  word_res->reject_map[i].setrej_unlv_rej ();
674  }
675  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
676  word_res->best_choice->set_unichar_id(unichar_space, i);
677  if (word_res->reject_map[i].accepted ())
678  word_res->reject_map[i].setrej_unlv_rej ();
679  }
680  }
681 }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET * uch_set
Definition: pageres.h:192
int UNICHAR_ID
Definition: unichar.h:33
bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 535 of file applybox.cpp.

536  {
537  for (int step = 0; *utf8 != '\0'; utf8 += step) {
538  const char* next_space = strchr(utf8, ' ');
539  if (next_space == NULL)
540  next_space = utf8 + strlen(utf8);
541  step = next_space - utf8;
542  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
543  if (class_id == INVALID_UNICHAR_ID) {
544  return false;
545  }
546  while (utf8[step] == ' ')
547  ++step;
548  class_ids->push_back(class_id);
549  }
550  return true;
551 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int push_back(T object)
UNICHARSET unicharset
Definition: ccutil.h:72
int UNICHAR_ID
Definition: unichar.h:33
#define NULL
Definition: host.h:144
void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES *  page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 772 of file applybox.cpp.

772  {
773  PAGE_RES_IT pr_it(page_res);
774  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
775  word_res = pr_it.forward()) {
776  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
777  word_res->correct_text.size());
778  for (int i = 0; i < word_res->correct_text.size(); ++i) {
779  // The part before the first space is the real ground truth, and the
780  // rest is the bounding box location and page number.
781  GenericVector<STRING> tokens;
782  word_res->correct_text[i].split(' ', &tokens);
783  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
784  choice->append_unichar_id_space_allocated(char_id,
785  word_res->best_state[i],
786  0.0f, 0.0f);
787  }
788  word_res->ClearWordChoices();
789  word_res->LogNewRawChoice(choice);
790  word_res->LogNewCookedChoice(1, false, choice);
791  }
792 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
UNICHARSET unicharset
Definition: ccutil.h:72
int UNICHAR_ID
Definition: unichar.h:33
WERD * word
Definition: pageres.h:175
#define NULL
Definition: host.h:144
inT16 tesseract::Tesseract::count_alphanums ( const WERD_CHOICE word)

Definition at line 410 of file output.cpp.

410  {
411  int count = 0;
412  for (int i = 0; i < word.length(); ++i) {
413  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
414  word.unicharset()->get_isdigit(word.unichar_id(i)))
415  count++;
416  }
417  return count;
418 }
int length() const
Definition: ratngs.h:300
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
inT16 tesseract::Tesseract::count_alphanums ( WERD_RES *  word)

Definition at line 558 of file reject.cpp.

558  {
559  int count = 0;
560  const WERD_CHOICE *best_choice = word_res->best_choice;
561  for (int i = 0; i < word_res->reject_map.length(); ++i) {
562  if ((word_res->reject_map[i].accepted()) &&
563  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
564  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
565  count++;
566  }
567  }
568  return count;
569 }
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
inT16 tesseract::Tesseract::count_alphas ( const WERD_CHOICE word)

Definition at line 400 of file output.cpp.

400  {
401  int count = 0;
402  for (int i = 0; i < word.length(); ++i) {
403  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
404  count++;
405  }
406  return count;
407 }
int length() const
Definition: ratngs.h:300
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int count(LIST var_list)
Definition: oldlist.cpp:108
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
inT16 tesseract::Tesseract::count_outline_errs ( char  c,
inT16  outline_count 
)

Definition at line 128 of file docqual.cpp.

128  {
129  int expected_outline_count;
130 
131  if (STRING (outlines_odd).contains (c))
132  return 0; //Dont use this char
133  else if (STRING (outlines_2).contains (c))
134  expected_outline_count = 2;
135  else
136  expected_outline_count = 1;
137  return abs (outline_count - expected_outline_count);
138 }
Definition: strngs.h:44
int tesseract::Tesseract::CountMisfitTops ( WERD_RES *  word_res)

Definition at line 69 of file fixxht.cpp.

69  {
70  int bad_blobs = 0;
71  int num_blobs = word_res->rebuild_word->NumBlobs();
72  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
73  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
74  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
75  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
76  int top = blob->bounding_box().top();
77  if (top >= INT_FEAT_RANGE)
78  top = INT_FEAT_RANGE - 1;
79  int min_bottom, max_bottom, min_top, max_top;
80  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
81  &min_top, &max_top);
82  if (max_top - min_top > kMaxCharTopRange)
83  continue;
84  bool bad = top < min_top - x_ht_acceptance_tolerance ||
85  top > max_top + x_ht_acceptance_tolerance;
86  if (bad)
87  ++bad_blobs;
88  if (debug_x_ht_level >= 1) {
89  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
90  unicharset.id_to_unichar(class_id),
91  bad ? "Misfit" : "OK", top, min_top, max_top,
92  static_cast<int>(x_ht_acceptance_tolerance));
93  }
94  }
95  }
96  return bad_blobs;
97 }
Definition: blobs.h:261
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
int UNICHAR_ID
Definition: unichar.h:33
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int kMaxCharTopRange
Definition: fixxht.cpp:66
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define INT_FEAT_RANGE
Definition: float2int.h:27
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 top() const
Definition: rect.h:54
bool tesseract::Tesseract::create_cube_box_word ( Boxa *  char_boxes,
int  num_chars,
TBOX  word_box,
BoxWord *  box_word 
)

Definition at line 116 of file cube_control.cpp.

119  {
120  if (!box_word) {
121  if (cube_debug_level > 0) {
122  tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
123  }
124  return false;
125  }
126 
127  // Find the x-coordinate of left-most char_box, which could be
128  // nonzero if the word image was padded before recognition took place.
129  int x_offset = -1;
130  for (int i = 0; i < num_chars; ++i) {
131  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
132  if (x_offset < 0 || char_box->x < x_offset) {
133  x_offset = char_box->x;
134  }
135  boxDestroy(&char_box);
136  }
137 
138  for (int i = 0; i < num_chars; ++i) {
139  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
140  TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
141  boxDestroy(&char_box);
142  box_word->InsertBox(i, tbox);
143  }
144  return true;
145 }
TBOX char_box_to_tbox(Box *char_box, TBOX word_box, int x_offset)
#define tprintf(...)
Definition: tprintf.h:31
Definition: rect.h:30
void tesseract::Tesseract::cube_combine_word ( CubeObject *  cube_obj,
WERD_RES *  cube_word,
WERD_RES *  tess_word 
)

Definition at line 283 of file cube_control.cpp.

284  {
285  float combiner_prob = tess_cube_combiner_->CombineResults(tess_word,
286  cube_obj);
287  // If combiner probability is greater than tess/cube combiner
288  // classifier threshold, i.e. tesseract wins, then just return the
289  // tesseract result unchanged, as the combiner knows nothing about how
290  // correct the answer is. If cube and tesseract agree, then improve the
291  // scores before returning.
292  WERD_CHOICE* tess_best = tess_word->best_choice;
293  WERD_CHOICE* cube_best = cube_word->best_choice;
295  tprintf("Combiner prob = %g vs threshold %g\n",
296  combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
297  }
298  if (combiner_prob >=
299  cube_cntxt_->Params()->CombinerClassifierThresh()) {
300  if (tess_best->unichar_string() == cube_best->unichar_string()) {
301  // Cube and tess agree, so improve the scores.
302  tess_best->set_rating(tess_best->rating() / 2);
303  tess_best->set_certainty(tess_best->certainty() / 2);
304  }
305  return;
306  }
307  // Cube wins.
308  // It is better for the language combiner to have all tesseract scores,
309  // so put them in the cube result.
310  cube_best->set_rating(tess_best->rating());
311  cube_best->set_certainty(tess_best->certainty());
313  tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
314  tess_best->unichar_string().string(),
315  cube_best->unichar_string().string());
316  }
317  tess_word->ConsumeWordResults(cube_word);
318 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
float rating() const
Definition: ratngs.h:324
double CombinerClassifierThresh() const
Definition: tuning_params.h:63
void set_certainty(float new_val)
Definition: ratngs.h:369
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31