39 #include "allheaders.h"
52 #include "config_auto.h"
59 "Take segmentation and labeling from box file",
61 BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
62 "Conversion of word/line box file to char box file",
65 "Generate training data from boxed chars", this->params()),
67 "Generate more boxes from boxed chars", this->params()),
69 "Dump intermediate images made during page segmentation",
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76 " 5=line, 6=word, 7=char"
77 " (Values from PageSegMode enum in publictypes.h)",
80 "Which OCR engine(s) to run (Tesseract, Cube, both)."
81 " Defaults to loading and running only Tesseract"
82 " (no Cube,no combiner)."
83 " Values from OcrEngineMode enum in tesseractclass.h)",
86 "Blacklist of chars not to recognize", this->params()),
88 "Whitelist of chars to recognize", this->params()),
90 "List of chars to override tessedit_char_blacklist",
93 "Perform training for ambiguities", this->params()),
96 "Whether to use the top-line splitting process for Devanagari "
97 "documents while performing page-segmentation.",
101 "Whether to use the top-line splitting process for Devanagari "
102 "documents while performing ocr.",
105 "Write all parameters to the given file.", this->params()),
107 "Generate and print debug"
108 " information for adaption",
110 INT_MEMBER(bidi_debug, 0,
"Debug level for BiDi", this->params()),
111 INT_MEMBER(applybox_debug, 1,
"Debug level", this->params()),
112 INT_MEMBER(applybox_page, 0,
"Page number to apply boxes from",
115 "Exposure value follows"
116 " this pattern in the image filename. The name of the image"
117 " files are expected to be in the form"
118 " [lang].[fontname].exp[num].tif",
120 BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
121 "Learn both character fragments (as is done in the"
122 " special low exposure mode) as well as unfragmented"
127 " is assumed to contain ngrams. Only learn the ngrams"
128 " whose outlines overlap horizontally.",
130 BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words",
132 BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices",
134 BOOL_MEMBER(tessedit_timing_debug, false,
"Print timing stats",
137 "Try to improve fuzzy spaces", this->params()),
139 "Dont bother with word plausibility", this->params()),
140 BOOL_MEMBER(tessedit_fix_hyphens, true,
"Crunch double hyphens?",
142 BOOL_MEMBER(tessedit_redo_xheight, true,
"Check/Correct x-height",
145 "Add words to the document dictionary", this->params()),
146 BOOL_MEMBER(tessedit_debug_fonts, false,
"Output font info per char",
148 BOOL_MEMBER(tessedit_debug_block_rejection, false,
"Block and Row stats",
150 BOOL_MEMBER(tessedit_enable_bigram_correction, true,
151 "Enable correction based on the word bigram dictionary.",
153 BOOL_MEMBER(tessedit_enable_dict_correction, false,
154 "Enable single word correction based on the dictionary.",
157 "Amount of debug output for bigram correction.",
160 "Remove and conditionally reassign small outlines when they"
161 " confuse layout analysis, determining diacritics vs noise",
163 INT_MEMBER(debug_noise_removal, 0,
"Debug reassignment of small outlines",
169 "Hingepoint for base char certainty", this->params()),
173 "Hingepoint for disjoint certainty", this->params()),
177 "Threshold for new punc char certainty", this->params()),
180 "Scaling on certainty diff from Hingepoint",
182 INT_MEMBER(noise_maxperblob, 8,
"Max diacritics to apply to a blob",
184 INT_MEMBER(noise_maxperword, 16,
"Max diacritics to apply to a word",
186 INT_MEMBER(debug_x_ht_level, 0,
"Reestimate debug", this->params()),
187 BOOL_MEMBER(debug_acceptable_wds, false,
"Dump word pass/fail chk",
189 STRING_MEMBER(chs_leading_punct,
"('`\"",
"Leading punctuation",
191 STRING_MEMBER(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation",
193 STRING_MEMBER(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation",
196 "good_quality_doc lte rejection limit", this->params()),
198 "good_quality_doc gte good blobs limit", this->params()),
200 "good_quality_doc lte outline error limit", this->params()),
202 "good_quality_doc gte good char limit", this->params()),
203 INT_MEMBER(quality_min_initial_alphas_reqd, 2,
"alphas in a good word",
206 "Adaptation decision algorithm for tess", this->params()),
208 "Do minimal rejection on pass 1 output", this->params()),
209 BOOL_MEMBER(tessedit_test_adaption, false,
"Test adaption criteria",
211 BOOL_MEMBER(tessedit_matcher_log, false,
"Log matcher activity",
214 "Adaptation decision algorithm for tess", this->params()),
215 BOOL_MEMBER(test_pt, false,
"Test for point", this->params()),
216 double_MEMBER(test_pt_x, 99999.99,
"xcoord", this->params()),
217 double_MEMBER(test_pt_y, 99999.99,
"ycoord", this->params()),
218 INT_MEMBER(paragraph_debug_level, 0,
"Print paragraph debug info.",
221 "Run paragraph detection on the post-text-recognition "
224 INT_MEMBER(cube_debug_level, 0,
"Print cube debug info.", this->params()),
225 STRING_MEMBER(outlines_odd,
"%| ",
"Non standard number of outlines",
227 STRING_MEMBER(outlines_2,
"ij!?%\":;",
"Non standard number of outlines",
230 "Allow outline errs in unrejection?", this->params()),
232 "Reduce rejection on good docs", this->params()),
233 BOOL_MEMBER(tessedit_use_reject_spaces, true,
"Reject spaces?",
236 "%rej allowed before rej whole doc", this->params()),
238 "%rej allowed before rej whole block", this->params()),
240 "%rej allowed before rej whole row", this->params()),
242 "Number of row rejects in whole word rejects"
243 "which prevents whole row rejection",
245 BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
246 "Only rej partially rejected words in block rejection",
248 BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
249 "Only rej partially rejected words in row rejection",
252 "Use word segmentation quality metric", this->params()),
254 "Use word segmentation quality metric", this->params()),
256 "Only preserve wds longer than this", this->params()),
258 "Apply row rejection to good docs", this->params()),
260 "rej good doc wd if more than this fraction rejected",
263 "Reject all bad quality wds", this->params()),
264 BOOL_MEMBER(tessedit_debug_doc_rejection, false,
"Page stats",
267 "Output data to debug file", this->params()),
268 BOOL_MEMBER(bland_unrej, false,
"unrej potential with no chekcs",
271 "good_quality_doc gte good char limit", this->params()),
273 "Mark v.bad words for tilde crunch", this->params()),
274 BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output",
276 BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?",
278 BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
279 "Take out ~^ early?", this->params()),
280 double_MEMBER(crunch_terrible_rating, 80.0,
"crunch rating lt this",
282 BOOL_MEMBER(crunch_terrible_garbage, true,
"As it says", this->params()),
284 "crunch garbage cert lt this", this->params()),
286 "crunch garbage rating lt this", this->params()),
287 double_MEMBER(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this",
289 double_MEMBER(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this",
291 BOOL_MEMBER(crunch_pot_garbage, true,
"POTENTIAL crunch garbage",
293 double_MEMBER(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this",
295 double_MEMBER(crunch_del_cert, -10.0,
"POTENTIAL crunch cert lt this",
297 double_MEMBER(crunch_del_min_ht, 0.7,
"Del if word ht lt xht x this",
299 double_MEMBER(crunch_del_max_ht, 3.0,
"Del if word ht gt xht x this",
302 "Del if word width lt xht x this", this->params()),
304 "Del if word gt xht x this above bl", this->params()),
306 "Del if word gt xht x this below bl", this->params()),
307 double_MEMBER(crunch_small_outlines_size, 0.6,
"Small if lt xht x this",
309 INT_MEMBER(crunch_rating_max, 10,
"For adj length in rating per ch",
312 "How many potential indicators needed", this->params()),
313 BOOL_MEMBER(crunch_leave_ok_strings, true,
"Dont touch sensible strings",
315 BOOL_MEMBER(crunch_accept_ok, true,
"Use acceptability in okstring",
318 "Dont pot crunch sensible strings", this->params()),
319 BOOL_MEMBER(crunch_include_numerals, false,
"Fiddle alpha figures",
322 "Dont crunch words with long lower case strings",
325 "Dont crunch words with long lower case strings",
328 "Crunch words with long repetitions", this->params()),
329 INT_MEMBER(crunch_debug, 0,
"As it says", this->params()),
331 "How many non-noise blbs either side?", this->params()),
332 double_MEMBER(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this",
335 "Reward punctation joins", this->params()),
336 INT_MEMBER(fixsp_done_mode, 1,
"What constitues done for spacing",
338 INT_MEMBER(debug_fix_space_level, 0,
"Contextual fixspace debug",
341 "Punct. chs expected WITHIN numbers", this->params()),
343 "Max allowed deviation of blob top outside of font data",
346 "Min change in xht before actually trying it", this->params()),
348 "Debug level for sub & superscript fixer", this->params()),
350 superscript_worse_certainty, 2.0,
351 "How many times worse "
352 "certainty does a superscript position glyph need to be for "
353 "us to try classifying it as a char with a different "
357 superscript_bettered_certainty, 0.97,
359 "badness do we think sufficient to choose a superscript "
360 "over what we'd thought. For example, a value of 0.6 means "
361 "we want to reduce badness of certainty by at least 40%",
364 "A superscript scaled down more than this is unbelievably "
365 "small. For example, 0.3 means we expect the font size to "
366 "be no smaller than 30% of the text line font size.",
369 "Maximum top of a character measured as a multiple of "
370 "x-height above the baseline for us to reconsider whether "
374 "Minimum bottom of a character measured as a multiple of "
375 "x-height above the baseline for us to reconsider whether "
376 "it's a superscript.",
378 BOOL_MEMBER(tessedit_write_block_separators, false,
379 "Write block separators in output", this->params()),
380 BOOL_MEMBER(tessedit_write_rep_codes, false,
"Write repetition char code",
382 BOOL_MEMBER(tessedit_write_unlv, false,
"Write .unlv output file",
384 BOOL_MEMBER(tessedit_create_txt, true,
"Write .txt output file",
386 BOOL_MEMBER(tessedit_create_hocr, false,
"Write .html hOCR output file",
388 BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file",
391 "Output char for unidentified blobs", this->params()),
392 INT_MEMBER(suspect_level, 99,
"Suspect marker level", this->params()),
394 "Min suspect level for rejecting spaces", this->params()),
396 "Dont Suspect dict wds longer than this", this->params()),
397 BOOL_MEMBER(suspect_constrain_1Il, false,
"UNLV keep 1Il chars rejected",
399 double_MEMBER(suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit",
401 double_MEMBER(suspect_accept_rating, -999.9,
"Accept good rating limit",
404 "Only reject tess failures", this->params()),
405 BOOL_MEMBER(tessedit_zero_rejection, false,
"Dont reject ANYTHING",
408 "Make output have exactly one word per WERD", this->params()),
410 "Dont reject ANYTHING AT ALL", this->params()),
412 "Force all rep chars the same", this->params()),
413 INT_MEMBER(tessedit_reject_mode, 0,
"Rejection algorithm",
415 BOOL_MEMBER(tessedit_rejection_debug, false,
"Adaption debug",
417 BOOL_MEMBER(tessedit_flip_0O, true,
"Contextual 0O O0 flips",
420 "Aspect ratio dot/hyphen test", this->params()),
422 "Aspect ratio dot/hyphen test", this->params()),
424 "Use DOC dawg in 11l conf. detector", this->params()),
425 BOOL_MEMBER(rej_1Il_use_dict_word, false,
"Use dictword test",
427 BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
"Dont double check",
429 BOOL_MEMBER(rej_use_tess_accepted, true,
"Individual rejection control",
431 BOOL_MEMBER(rej_use_tess_blanks, true,
"Individual rejection control",
433 BOOL_MEMBER(rej_use_good_perm, true,
"Individual rejection control",
435 BOOL_MEMBER(rej_use_sensible_wd, false,
"Extend permuter check",
437 BOOL_MEMBER(rej_alphas_in_number_perm, false,
"Extend permuter check",
440 "if >this fract", this->params()),
441 INT_MEMBER(tessedit_image_border, 2,
"Rej blbs near image edge limit",
444 "Allow NN to unrej", this->params()),
445 STRING_MEMBER(conflict_set_I_l_1,
"Il1[]",
"Il1 conflict set",
447 INT_MEMBER(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this",
449 BOOL_MEMBER(tessedit_create_boxfile, false,
"Output text with boxes",
453 " , else specifc page to process",
456 "Capture the image from the IPE", this->params()),
457 BOOL_MEMBER(interactive_display_mode, false,
"Run interactively?",
459 STRING_MEMBER(file_type,
".tif",
"Filename extension", this->params()),
460 BOOL_MEMBER(tessedit_override_permuter, true,
"According to dict_word",
464 " TessdataManager functions.",
467 "List of languages to load with this one", this->params()),
468 BOOL_MEMBER(tessedit_use_primary_params_model, false,
469 "In multilingual mode use params model of the"
473 "Min acceptable orientation margin", this->params()),
474 BOOL_MEMBER(textord_tabfind_show_vlines, false,
"Debug line finding",
479 "Allow feature extractors to see the original outline",
482 "Only initialize with the config file. Useful if the "
483 "instance is not going to be used for OCR but say only "
484 "for layout analysis.",
486 BOOL_MEMBER(textord_equation_detect, false,
"Turn on equation detector",
489 "Enable vertical detection", this->params()),
490 BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
491 "Force using vertical text page mode", this->params()),
493 textord_tabfind_vertical_text_ratio, 0.5,
494 "Fraction of textlines deemed vertical to use vertical page "
498 textord_tabfind_aligned_gap_fraction, 0.75,
499 "Fraction of height used as a minimum gap for aligned blobs.",
501 INT_MEMBER(tessedit_parallelize, 0,
"Run in parallel where possible",
504 "Preserve multiple interword spaces", this->params()),
506 "Include page separator string in output text after each "
510 "Page separator (default is form feed control character)",
522 BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
523 "find horizontal lines such as headers in vertical page mode",
525 INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm",
528 "Load fixed length dawgs"
529 " (e.g. for non-space delimited languages)",
531 INT_MEMBER(segment_debug, 0,
"Debug the whole segmentation process",
533 BOOL_MEMBER(permute_debug, 0,
"Debug char permutation process",
536 "Multiplying factor of"
537 " current best rate to prune other hypotheses",
540 "Turn on word script consistency permuter", this->params()),
542 "incorporate segmentation cost in word rating?",
545 "Score multipler for script consistency within a word. "
546 "Being a 'reward' factor, it should be <= 1. "
547 "Smaller value implies bigger reward.",
550 "Turn on fixed-length phrasebook search permuter",
553 "Turn on character type (property) consistency permuter",
556 "Score multipler for char type consistency within a word. ",
559 "Score multipler for ngram permuter's best choice"
560 " (only used in the Han script path).",
563 "Activate character-level n-gram-based permuter",
565 BOOL_MEMBER(permute_only_top, false,
"Run only the top choice permuter",
567 INT_MEMBER(language_model_fixed_length_choices_depth, 3,
568 "Depth of blob choice lists to explore"
569 " when fixed length dawgs are on",
572 "use new state cost heuristics for segmentation state"
576 "base factor for adding segmentation cost into word rating."
577 "It's a multiplying factor, the larger the value above 1, "
578 "the bigger the effect of segmentation cost.",
581 "weight associated with char rating in combined cost of"
585 "weight associated with width evidence in combined cost of"
589 "weight associated with seam cut in combined cost of state",
592 "max char width-to-height ratio allowed in segmentation",
595 "Enable new segmentation search path.", this->params()),
597 "Maximum character width-to-height ratio for"
598 " fixed-pitch fonts",
602 backup_config_file_(
NULL),
606 pix_thresholds_(
NULL),
607 source_resolution_(0),
609 right_to_left_(false),
614 most_recently_used_(this),
616 #ifndef ANDROID_BUILD
618 tess_cube_combiner_(
NULL),
626 sub_langs_.delete_data_pointers();
627 #ifndef ANDROID_BUILD
629 if (cube_cntxt_ !=
NULL) {
633 if (tess_cube_combiner_ !=
NULL) {
634 delete tess_cube_combiner_;
635 tess_cube_combiner_ =
NULL;
641 pixDestroy(&pix_binary_);
642 pixDestroy(&cube_binary_);
643 pixDestroy(&pix_grey_);
644 pixDestroy(&pix_thresholds_);
645 pixDestroy(&scaled_color_);
646 deskew_ =
FCOORD(1.0f, 0.0f);
647 reskew_ =
FCOORD(1.0f, 0.0f);
650 for (
int i = 0; i < sub_langs_.size(); ++i)
651 sub_langs_[i]->
Clear();
655 equ_detect_ = detector;
662 for (
int i = 0; i < sub_langs_.size(); ++i) {
663 sub_langs_[i]->ResetAdaptiveClassifierInternal();
670 for (
int i = 0; i < sub_langs_.size(); ++i) {
671 sub_langs_[i]->getDict().ResetDocumentDictionary();
681 for (
int i = 0; i < sub_langs_.size(); ++i) {
682 sub_langs_[i]->unicharset.set_black_and_whitelist(
692 pixDestroy(&cube_binary_);
698 for (
int i = 0; i < sub_langs_.size(); ++i) {
701 static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
702 if (pageseg_strategy > max_pageseg_strategy)
703 max_pageseg_strategy = pageseg_strategy;
705 pixDestroy(&sub_langs_[i]->cube_binary_);
706 sub_langs_[i]->cube_binary_ = pixClone(
pix_binary());
707 pixDestroy(&sub_langs_[i]->pix_binary_);
708 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
714 if (splitter_.
Split(
true)) {
716 pixDestroy(&pix_binary_);
732 for (
int i = 0; i < sub_langs_.size(); ++i) {
735 static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
736 if (ocr_strategy > max_ocr_strategy)
737 max_ocr_strategy = ocr_strategy;
743 bool split_for_ocr = splitter_.
Split(
false);
746 pixDestroy(&pix_binary_);
747 pix_binary_ = pixClone(splitter_.
orig_pix());
752 BLOCK block(
"",
TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
753 pixGetHeight(pix_binary_));
C_BLOB_LIST * blob_list()
get blobs
void extract_edges(Pix *pix, BLOCK *block)
#define STRING_MEMBER(name, val, comment, vec)
void set_use_cjk_fp_model(bool flag)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void ResetDocumentDictionary()
void SetBlackAndWhitelist()
bool HasDifferentSplitStrategies() const
void ResetAdaptiveClassifier()
char * tessedit_char_blacklist
#define BOOL_MEMBER(name, val, comment, vec)
char * tessedit_char_unblacklist
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void set_orig_pix(Pix *pix)
int pageseg_devanagari_split_strategy
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
void set_pageseg_split_strategy(SplitStrategy strategy)
void SetLangTesseract(Tesseract *lang_tesseract)
#define INT_MEMBER(name, val, comment, vec)
void set_segmentation_block_list(BLOCK_LIST *block_list)
Assume a single uniform block of text. (Default.)
bool Split(bool split_for_pageseg)
int ocr_devanagari_split_strategy
void SetEquationDetect(EquationDetect *detector)
void set_ocr_split_strategy(SplitStrategy strategy)
#define double_MEMBER(name, val, comment, vec)
void ResetAdaptiveClassifierInternal()
char * tessedit_char_whitelist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
bool textord_use_cjk_fp_model
void ResetDocumentDictionary()
#define INT_INIT_MEMBER(name, val, comment, vec)