23 static int LeadingUnicharsToChopped(
WERD_RES *word,
int num_unichars) {
25 for (
int i = 0; i < num_unichars; i++)
30 static int TrailingUnicharsToChopped(
WERD_RES *word,
int num_unichars) {
32 for (
int i = 0; i < num_unichars; i++)
46 static void YOutlierPieces(
WERD_RES *word,
int rebuilt_blob_index,
47 int super_y_bottom,
int sub_y_top,
48 ScriptPos *leading_pos,
int *num_leading_outliers,
50 int *num_trailing_outliers) {
53 if (!leading_pos) leading_pos = &sp_unused1;
54 if (!num_leading_outliers) num_leading_outliers = &unused1;
55 if (!trailing_pos) trailing_pos = &sp_unused2;
56 if (!num_trailing_outliers) num_trailing_outliers = &unused2;
58 *num_leading_outliers = *num_trailing_outliers = 0;
61 int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
62 int num_chopped_pieces = word->
best_state[rebuilt_blob_index];
64 int trailing_outliers = 0;
65 for (
int i = 0; i < num_chopped_pieces; i++) {
68 if (box.
bottom() >= super_y_bottom) {
70 }
else if (box.
top() <= sub_y_top) {
74 if (trailing_outliers == i) {
75 *num_leading_outliers = trailing_outliers;
76 *leading_pos = last_pos;
78 trailing_outliers = 0;
80 if (pos == last_pos) {
83 trailing_outliers = 1;
88 *num_trailing_outliers = trailing_outliers;
89 *trailing_pos = last_pos;
107 int num_leading, num_trailing;
109 float leading_certainty, trailing_certainty;
110 float avg_certainty, unlikely_threshold;
114 word, &num_leading, &sp_leading, &leading_certainty,
115 &num_trailing, &sp_trailing, &trailing_certainty,
116 &avg_certainty, &unlikely_threshold);
118 const char *leading_pos = sp_leading ==
SP_SUBSCRIPT ?
"sub" :
"super";
119 const char *trailing_pos = sp_trailing ==
SP_SUBSCRIPT ?
"sub" :
"super";
127 int num_remainder_leading = 0, num_remainder_trailing = 0;
128 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133 int last_word_char = num_blobs - 1 - num_trailing;
136 last_char_certainty <= unlikely_threshold) {
138 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
139 nullptr,
nullptr, &rpos, &num_remainder_trailing);
140 if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
141 if (num_remainder_trailing > 0 &&
142 last_char_certainty < trailing_certainty) {
143 trailing_certainty = last_char_certainty;
146 bool another_blob_available = (num_remainder_trailing == 0) ||
147 num_leading + num_trailing + 1 < num_blobs;
149 if (another_blob_available &&
151 first_char_certainty <= unlikely_threshold) {
153 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
154 &lpos, &num_remainder_leading,
nullptr,
nullptr);
155 if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
156 if (num_remainder_leading > 0 &&
157 first_char_certainty < leading_certainty) {
158 leading_certainty = first_char_certainty;
164 if (num_leading + num_trailing +
165 num_remainder_leading + num_remainder_trailing == 0) {
170 tprintf(
"Candidate for superscript detection: %s (",
172 if (num_leading || num_remainder_leading) {
173 tprintf(
"%d.%d %s-leading ", num_leading, num_remainder_leading,
176 if (num_trailing || num_remainder_trailing) {
177 tprintf(
"%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
186 tprintf(
" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
187 avg_certainty, unlikely_threshold);
189 tprintf(
"Orig. leading (min): %.2f ", leading_certainty);
191 tprintf(
"Orig. trailing (min): %.2f ", trailing_certainty);
198 int num_chopped_leading =
199 LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
200 int num_chopped_trailing =
201 TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good =
false;
207 num_chopped_leading, leading_certainty, sp_leading,
208 num_chopped_trailing, trailing_certainty, sp_trailing,
209 word, &is_good, &retry_leading, &retry_trailing);
212 }
else if (retry_leading || retry_trailing) {
213 int retry_chopped_leading =
214 LeadingUnicharsToChopped(revised, retry_leading);
215 int retry_chopped_trailing =
216 TrailingUnicharsToChopped(revised, retry_trailing);
218 retry_chopped_leading, leading_certainty, sp_leading,
219 retry_chopped_trailing, trailing_certainty, sp_trailing,
220 revised, &is_good, &retry_leading, &retry_trailing);
255 int *num_rebuilt_leading,
257 float *leading_certainty,
258 int *num_rebuilt_trailing,
260 float *trailing_certainty,
261 float *avg_certainty,
262 float *unlikely_threshold) {
263 *avg_certainty = *unlikely_threshold = 0.0f;
264 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
265 *leading_certainty = *trailing_certainty = 0.0f;
275 *leading_pos = *trailing_pos =
SP_NORMAL;
276 int leading_outliers = 0;
277 int trailing_outliers = 0;
279 float normal_certainty_total = 0.0f;
280 float worst_normal_certainty = 0.0f;
283 for (
int b = 0; b < num_blobs; ++b) {
286 if (box.
bottom() >= super_y_bottom) {
288 }
else if (box.
top() <= sub_y_top) {
294 if (char_certainty < worst_normal_certainty) {
295 worst_normal_certainty = char_certainty;
298 normal_certainty_total += char_certainty;
300 if (trailing_outliers == b) {
301 leading_outliers = trailing_outliers;
302 *leading_pos = last_pos;
304 trailing_outliers = 0;
306 if (last_pos == pos) {
309 trailing_outliers = 1;
314 *trailing_pos = last_pos;
315 if (num_normal >= 3) {
317 normal_certainty_total -= worst_normal_certainty;
319 if (num_normal > 0) {
320 *avg_certainty = normal_certainty_total / num_normal;
323 if (num_normal == 0 ||
324 (leading_outliers == 0 && trailing_outliers == 0)) {
331 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
332 *num_rebuilt_leading < leading_outliers;
333 (*num_rebuilt_leading)++) {
335 if (char_certainty > *unlikely_threshold) {
338 if (char_certainty < *leading_certainty) {
339 *leading_certainty = char_certainty;
344 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
345 *num_rebuilt_trailing < trailing_outliers;
346 (*num_rebuilt_trailing)++) {
347 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
349 if (char_certainty > *unlikely_threshold) {
352 if (char_certainty < *trailing_certainty) {
353 *trailing_certainty = char_certainty;
384 int num_chopped_leading,
float leading_certainty,
ScriptPos leading_pos,
385 int num_chopped_trailing,
float trailing_certainty,
389 int *retry_rebuild_leading,
int *retry_rebuild_trailing) {
392 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
401 if (num_chopped_leading > 0) {
403 split_word(prefix, num_chopped_leading, &core, &bb0);
408 if (num_chopped_trailing > 0) {
409 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
423 tprintf(
" recognizing first %d chopped blobs\n", num_chopped_leading);
427 tprintf(
" The leading bits look like %s %s\n",
438 tprintf(
" recognizing middle %d chopped blobs\n",
439 num_chopped - num_chopped_leading - num_chopped_trailing);
448 tprintf(
" recognizing last %d chopped blobs\n", num_chopped_trailing);
452 tprintf(
" The trailing bits look like %s %s\n",
467 retry_rebuild_leading,
nullptr);
471 nullptr, retry_rebuild_trailing);
473 *is_good = good_prefix && good_suffix;
474 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
497 tprintf(
"%s superscript fix: %s\n", *is_good ?
"ACCEPT" :
"REJECT",
524 float certainty_threshold,
526 int *right_ok)
const {
527 int initial_ok_run_count = 0;
528 int ok_run_count = 0;
529 float worst_certainty = 0.0f;
533 for (
int i = 0; i < wc.
length(); i++) {
537 bool bad_certainty = char_certainty < certainty_threshold;
541 if (choice && fontinfo_table.
size() > 0) {
544 bool font1_is_italic = font_id1 >= 0
547 is_italic = font1_is_italic &&
548 (font_id2 < 0 || fontinfo_table.
get(font_id2).
is_italic());
551 float height_fraction = 1.0f;
553 float normal_height = char_height;
555 int min_bot, max_bot, min_top, max_top;
559 float hi_height = max_top - max_bot;
560 float lo_height = min_top - min_bot;
561 normal_height = (hi_height + lo_height) / 2;
565 height_fraction = char_height / normal_height;
572 tprintf(
" Rejecting: superscript is italic.\n");
575 tprintf(
" Rejecting: punctuation present.\n");
579 tprintf(
" Rejecting: don't believe character %s with certainty %.2f " 580 "which is less than threshold %.2f\n", char_str,
581 char_certainty, certainty_threshold);
584 tprintf(
" Rejecting: character %s seems too small @ %.2f versus " 585 "expected %.2f\n", char_str, char_height, normal_height);
588 if (bad_certainty || bad_height || is_punc || is_italic) {
589 if (ok_run_count == i) {
590 initial_ok_run_count = ok_run_count;
596 if (char_certainty < worst_certainty) {
597 worst_certainty = char_certainty;
600 bool all_ok = ok_run_count == wc.
length();
601 if (all_ok && debug) {
602 tprintf(
" Accept: worst revised certainty is %.2f\n", worst_certainty);
605 if (left_ok) *left_ok = initial_ok_run_count;
606 if (right_ok) *right_ok = ok_run_count;
double subscript_max_y_top
double superscript_bettered_certainty
void ConsumeWordResults(WERD_RES *word)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
double superscript_worse_certainty
UnicityTable< FontInfo > & get_fontinfo_table()
const char * string() const
const UNICHARSET * unicharset() const
const FontInfo * fontinfo
const int kBlnBaselineOffset
double superscript_scaledown_ratio
bool SubAndSuperscriptFix(WERD_RES *word_res)
int16_t fontinfo_id() const
double superscript_min_y_bottom
int classify_class_pruner_multiplier
bool flag(WERD_FLAGS mask) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
int16_t fontinfo_id2() const
int size() const
Return the size used.
GenericVector< int > best_state
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
UNICHAR_ID unichar_id(int index) const
DLLSYM void tprintf(const char *format,...)
TBOX bounding_box() const
GenericVector< TBLOB * > blobs
int classify_integer_matcher_multiplier
const T & get(int id) const
Return the object from an id.
const char * id_to_unichar(UNICHAR_ID id) const
const char * ScriptPosToString(enum ScriptPos script_pos)
void recog_word_recursive(WERD_RES *word)
const STRING & unichar_string() const
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
void SetAllScriptPositions(tesseract::ScriptPos position)
bool top_bottom_useful() const
WERD_CHOICE * best_choice
BLOB_CHOICE * GetBlobChoice(int index) const