// Multiplier applied to the (f2 - f1) difference inside
// StopperAmbigThreshold().
47 static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant subtracted from the scaled difference inside
// StopperAmbigThreshold(). Its negation is also used elsewhere in this file
// as an upper clamp on a computed certainty delta.
50 static const double kStopperAmbiguityThresholdOffset = 1.5;
// Returns a linear function of the gap between the two inputs:
//   (f2 - f1) * kStopperAmbiguityThresholdGain
//       - kStopperAmbiguityThresholdOffset.
// NOTE(review): f1 and f2 are presumably certainty values of two competing
// choices -- confirm against the call sites.
64 static double StopperAmbigThreshold(
double f1,
double f2) {
65 return (f2 - f1) * kStopperAmbiguityThresholdGain -
66 kStopperAmbiguityThresholdOffset;
75 bool merge_similar_words,
76 BLOCK_LIST *the_block_list,
79 BLOCK_IT block_it(the_block_list);
81 for (block_it.mark_cycle_pt();
82 !block_it.cycled_list(); block_it.forward()) {
83 block_res_it.add_to_end(
new BLOCK_RES(merge_similar_words,
96 ROW_IT row_it (the_block->
row_list ());
108 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
109 row_res_it.add_to_end(
new ROW_RES(merge_similar_words, row_it.data()));
130 bool add_next_word =
false;
134 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
135 auto* word_res =
new WERD_RES(word_it.data());
140 word_res->part_of_combo =
true;
142 }
else if (merge_similar_words) {
143 union_box = word_res->word->bounding_box();
144 add_next_word = !word_res->word->flag(
W_REP_CHAR) &&
146 word_res->odd_size = !add_next_word;
148 WERD* next_word = word_it.data_relative(1);
149 if (merge_similar_words) {
157 int prev_right = union_box.
right();
158 union_box += next_box;
162 add_next_word =
false;
170 if (combo ==
nullptr) {
171 copy_word =
new WERD;
172 *copy_word = *(word_it.data());
176 word_res_it.add_to_end(combo);
178 word_res->part_of_combo =
true;
182 word_res_it.add_to_end(word_res);
215 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.
best_choices));
217 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
219 wc_dest_it.add_after_then_move(
new WERD_CHOICE(*choice));
221 if (!wc_dest_it.empty()) {
222 wc_dest_it.move_to_first();
304 const TBOX* norm_box,
307 bool allow_detailed_fx,
309 auto norm_mode_hint =
310 static_cast<tesseract::OcrEngineMode>(norm_mode);
315 (pb !=
nullptr && !pb->
IsText())) {
325 float word_xheight = use_body_size && row !=
nullptr && row->
body_size() > 0.0f
329 norm_mode_hint, norm_box, &
denorm);
359 if (blob_count > 0) {
360 auto** fake_choices =
new BLOB_CHOICE*[blob_count];
365 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
366 TBOX box = b_it.data()->bounding_box();
371 delete [] fake_choices;
403 for (
int b = 0; b < num_blobs; ++b) {
407 if (b + 1 < num_blobs) {
428 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
440 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&
best_choices));
441 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
460 tprintf(
"raw_choice has total of states = %d vs ratings dim of %d\n",
466 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
469 tprintf(
"Cooked #%d has total of states = %d vs ratings dim of %d\n",
481 (word_to_debug !=
nullptr && *word_to_debug !=
'\0' &&
best_choice !=
nullptr &&
488 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
499 tprintf(
"Best choice: accepted=%d, adaptable=%d, done=%d : ",
516 if (debug_level >= 2)
520 for (it.forward(); !it.at_first(); it.forward(), ++index) {
529 int i = 0, j = 0, chunk = 0;
535 while (i < choice->length() && j < best_choice->length()) {
538 if (debug_level >= 2) {
539 choice->
print(
"WorstCertaintyDiffWorseThan");
541 "i %d j %d Choice->Blob[i].Certainty %.4g"
542 " WorstOtherChoiceCertainty %g Threshold %g\n",
544 tprintf(
"Discarding bad choice #%d\n", index);
551 while (choice_chunk < chunk && ++i < choice->length())
552 choice_chunk += choice->
state(i);
554 while (best_chunk < chunk && ++j < best_choice->length())
570 float avg_rating = 0.0f;
571 int num_error_chunks = 0;
574 while (chunk < end_chunk) {
575 if (chunk >= end_raw_chunk) {
587 if (num_error_chunks > 0) {
588 avg_rating /= num_error_chunks;
589 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
591 *thresholds = max_rating;
594 if (*thresholds > max_rating)
595 *thresholds = max_rating;
596 if (*thresholds < min_rating)
597 *thresholds = min_rating;
627 float max_certainty_delta =
630 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
631 max_certainty_delta = -kStopperAmbiguityThresholdOffset;
633 max_certainty_delta) {
637 tprintf(
"Discarding choice \"%s\" with an overly low certainty"
638 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
652 bool inserted =
false;
657 if (choice->
rating() > word_choice->
rating() && !inserted) {
659 it.add_before_stay_put(word_choice);
661 if (num_choices == 0)
672 tprintf(
"Discarding duplicate choice \"%s\", rating %g vs %g\n",
680 if (num_choices > max_num_choices)
684 }
while (!it.at_first());
686 if (!inserted && num_choices < max_num_choices) {
687 it.add_to_end(word_choice);
689 if (num_choices == 0)
697 word_choice->
print(
" Word Choice");
709 template<
class T>
static void MovePointerData(T**
dest, T**src) {
718 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&
best_choices));
719 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
720 if (!it.at_first()) alternates_str +=
"\", \"";
721 alternates_str += it.data()->unichar_string();
723 tprintf(
"Alternates for \"%s\": {\"%s\"}\n",
731 for (
int b = start_blob; b <= last_blob; ++b) {
772 word->seam_array.clear();
783 wc_it.add_list_after(&
word->best_choices);
785 if (
word->blamer_bundle !=
nullptr) {
841 for (
int i = 0; i < word_len; ++i) {
867 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
868 wc_it.data()->SetAllScriptPositions(position);
883 for (
int c = 0; c < blob_count; ++c) {
884 auto* choice_list =
new BLOB_CHOICE_LIST;
885 BLOB_CHOICE_IT choice_it(choice_list);
886 choice_it.add_after_then_move(choices[c]);
900 word_choice->set_permuter(permuter);
901 for (
int b = 0; b < num_blobs; ++b) {
905 float certainty = -FLT_MAX;
906 BLOB_CHOICE_LIST* choices =
ratings->
get(b, b);
907 if (choices !=
nullptr && !choices->empty()) {
908 BLOB_CHOICE_IT bc_it(choices);
911 rating = choice->
rating();
914 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
940 std::function<
bool(
const TBOX&,
const TBOX&)> box_cb) {
942 bool modified =
false;
946 if (new_id != INVALID_UNICHAR_ID &&
962 BLOB_CHOICE_IT bc_it(blob_choices);
963 bc_it.add_before_then_move(blob_choice);
990 static int is_simple_quote(
const char* signed_str,
int length) {
992 reinterpret_cast<const unsigned char*>(signed_str);
994 return (length == 1 && (*str ==
'\'' || *str ==
'`')) ||
996 (length == 3 && ((*str == 0xe2 &&
997 *(str + 1) == 0x80 &&
998 *(str + 2) == 0x98) ||
1000 *(str + 1) == 0x80 &&
1001 *(str + 2) == 0x99)));
1009 if (is_simple_quote(ch, strlen(ch)) &&
1010 is_simple_quote(next_ch, strlen(next_ch)))
1012 return INVALID_UNICHAR_ID;
1021 using namespace std::placeholders;
1031 if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1032 (*ch ==
'-' || *ch ==
'~') && (*next_ch ==
'-' || *next_ch ==
'~'))
1034 return INVALID_UNICHAR_ID;
1050 using namespace std::placeholders;
1061 return INVALID_UNICHAR_ID;
1066 using namespace std::placeholders;
1068 this, _1, _2),
nullptr)) {
1079 for (
int index = start; index < start +
count - 1; ++index) {
1082 if (seam !=
nullptr && seam->
HasAnySplits())
return false;
1146 if (other.block_res ==
nullptr) {
1148 if (block_res ==
nullptr)
1152 if (block_res ==
nullptr) {
1155 if (block_res == other.block_res) {
1156 if (other.row_res ==
nullptr || row_res ==
nullptr) {
1160 if (row_res == other.row_res) {
1162 ASSERT_HOST(other.word_res !=
nullptr && word_res !=
nullptr);
1163 if (word_res == other.word_res) {
1168 WERD_RES_IT word_res_it(&row_res->word_res_list);
1169 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1170 word_res_it.forward()) {
1171 if (word_res_it.data() == word_res) {
1173 }
else if (word_res_it.data() == other.word_res) {
1177 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1181 ROW_RES_IT row_res_it(&block_res->row_res_list);
1182 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1183 row_res_it.forward()) {
1184 if (row_res_it.data() == row_res) {
1186 }
else if (row_res_it.data() == other.row_res) {
1190 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1194 BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1195 for (block_res_it.mark_cycle_pt();
1196 !block_res_it.cycled_list(); block_res_it.forward()) {
1197 if (block_res_it.data() == block_res) {
1199 }
else if (block_res_it.data() == other.block_res) {
1204 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1215 auto* new_res =
new WERD_RES(new_word);
1216 new_res->CopySimpleFields(clone_res);
1217 new_res->combination =
true;
1219 WERD_RES_IT wr_it(&row()->word_res_list);
1220 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1222 if (
word == word_res)
1226 wr_it.add_before_then_move(new_res);
1227 if (wr_it.at_first()) {
1230 ResetWordIterator();
1239 C_BLOB_LIST* next_word_blobs,
1242 for (
int i = 0; i <
word.best_state.size(); ++i) {
1243 int length =
word.best_state[i];
1245 TBOX blob_box = blob_it.data()->bounding_box();
1247 for (
int b = 1; b < length; ++b) {
1248 blob_box += blob_it.data()->bounding_box();
1253 int blob_end = INT32_MAX;
1254 if (!blob_it.at_first() || next_word_blobs !=
nullptr) {
1255 if (blob_it.at_first())
1256 blob_it.set_to_list(next_word_blobs);
1257 blob_end = (blob_box.
right() + blob_it.data()->bounding_box().left()) / 2;
1259 blob_end = ClipToRange<int>(blob_end, clip_box.
left(), clip_box.
right());
1268 int w_index,
TBOX prev_box, WERD_RES_IT w_it) {
1269 constexpr
int kSignificantOverlapFraction = 4;
1271 TBOX current_box = words[w_index]->word->bounding_box();
1273 if (w_index + 1 < words.
size() && words[w_index + 1] !=
nullptr &&
1274 words[w_index + 1]->word !=
nullptr)
1276 for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1278 if (w_it.data() ==
nullptr || w_it.data()->word ==
nullptr)
continue;
1279 TBOX w_box = w_it.data()->word->bounding_box();
1280 int height_limit = std::min<int>(w_box.
height(), w_box.
width() / 2);
1281 int width_limit = w_box.
width() / kSignificantOverlapFraction;
1282 int min_significant_overlap = std::max(height_limit, width_limit);
1286 if (overlap > min_significant_overlap) {
1287 if (prev_overlap > min_significant_overlap) {
1290 }
else if (next_overlap > min_significant_overlap) {
1294 clipped_box += w_box;
1298 if (clipped_box.
height() <= 0) {
1302 if (clipped_box.
width() <= 0) clipped_box = current_box;
1308 static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1309 const TBOX& clip_box) {
1310 C_BLOB* src_blob = src_it->extract();
1314 ClipToRange<int>(box.
left(), clip_box.
left(), clip_box.
right() - 1);
1316 ClipToRange<int>(box.
right(), clip_box.
left() + 1, clip_box.
right());
1318 ClipToRange<int>(box.
top(), clip_box.
bottom() + 1, clip_box.
top());
1320 ClipToRange<int>(box.
bottom(), clip_box.
bottom(), clip_box.
top() - 1);
1321 box =
TBOX(left, bottom, right, top);
1325 dest_it->add_after_then_move(src_blob);
1334 if (words->
empty()) {
1335 DeleteCurrentWord();
1341 (*words)[0]->word->set_flag(
W_BOL,
true);
1343 (*words)[0]->word->set_blanks(input_word->
word->
space());
1353 WERD_IT w_it(row()->row->word_list());
1355 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1364 WERD_RES_IT wr_it(&row()->word_res_list);
1365 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1367 if (
word == input_word)
1378 for (
int w = 0; w < words->
size(); ++w) {
1380 clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1383 C_BLOB_LIST* next_word_blobs =
1384 w + 1 < words->
size() ? (*words)[w + 1]->word->cblob_list() :
nullptr;
1385 ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1388 C_BLOB_LIST fake_blobs;
1389 C_BLOB_IT fake_b_it(&fake_blobs);
1391 fake_b_it.move_to_first();
1396 for (
int i = 0; i < blob_ends.
size(); ++i, fake_b_it.forward()) {
1397 int end_x = blob_ends[i];
1400 while (!src_b_it.empty() &&
1401 src_b_it.data()->bounding_box().x_middle() < end_x) {
1402 blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1405 while (!rej_b_it.empty() &&
1406 rej_b_it.data()->bounding_box().x_middle() < end_x) {
1407 blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1412 blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1421 w_it.add_before_stay_put(word_w->
word);
1424 (*words)[w] =
nullptr;
1425 wr_it.add_before_stay_put(word_w);
1433 delete w_it.extract();
1434 delete wr_it.extract();
1435 ResetWordIterator();
1443 if (!word_res->combination) {
1447 WERD_IT w_it(row()->row->word_list());
1448 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1449 if (w_it.data() == word_res->word) {
1454 delete w_it.extract();
1458 WERD_RES_IT wr_it(&row()->word_res_list);
1459 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1460 if (wr_it.data() == word_res) {
1466 delete wr_it.extract();
1467 ResetWordIterator();
1473 WERD* real_word = word_res->word;
1476 if (word_res->combination) {
1479 WERD_RES_IT wr_it(&row()->word_res_list);
1480 for (wr_it.mark_cycle_pt();
1481 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1485 real_word = wr_it.data()->word;
1500 block_res_it.set_to_list(&page_res->block_res_list);
1501 block_res_it.mark_cycle_pt();
1502 prev_block_res =
nullptr;
1503 prev_row_res =
nullptr;
1504 prev_word_res =
nullptr;
1505 block_res =
nullptr;
1508 next_block_res =
nullptr;
1509 next_row_res =
nullptr;
1510 next_word_res =
nullptr;
1511 internal_forward(
true, empty_ok);
1512 return internal_forward(
false, empty_ok);
1523 if (row_res == next_row_res) {
1526 word_res_it.move_to_first();
1527 for (word_res_it.mark_cycle_pt();
1528 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1529 word_res_it.forward()) {
1530 if (!word_res_it.data()->part_of_combo) {
1531 if (prev_row_res == row_res) prev_word_res = word_res;
1532 word_res = word_res_it.data();
1536 wr_it_of_next_word = word_res_it;
1537 word_res_it.forward();
1540 WERD_RES_IT wr_it(&row_res->word_res_list);
1541 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1542 if (!wr_it.data()->part_of_combo) {
1543 if (prev_row_res == row_res) prev_word_res = word_res;
1544 word_res = wr_it.data();
1565 WERD_RES *PAGE_RES_IT::internal_forward(
bool new_block,
bool empty_ok) {
1566 bool new_row =
false;
1568 prev_block_res = block_res;
1569 prev_row_res = row_res;
1570 prev_word_res = word_res;
1571 block_res = next_block_res;
1572 row_res = next_row_res;
1573 word_res = next_word_res;
1574 wr_it_of_current_word = wr_it_of_next_word;
1575 next_block_res =
nullptr;
1576 next_row_res =
nullptr;
1577 next_word_res =
nullptr;
1579 while (!block_res_it.cycled_list()) {
1582 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1583 row_res_it.mark_cycle_pt();
1584 if (row_res_it.empty() && empty_ok) {
1585 next_block_res = block_res_it.data();
1590 while (!row_res_it.cycled_list()) {
1593 word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1594 word_res_it.mark_cycle_pt();
1597 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1598 word_res_it.forward();
1599 if (!word_res_it.cycled_list()) {
1600 next_block_res = block_res_it.data();
1601 next_row_res = row_res_it.data();
1602 next_word_res = word_res_it.data();
1603 wr_it_of_next_word = word_res_it;
1604 word_res_it.forward();
1608 row_res_it.forward();
1612 block_res_it.forward();
1617 if (page_res !=
nullptr && page_res->prev_word_best_choice !=
nullptr) {
1618 *page_res->prev_word_best_choice =
1619 (new_block || prev_word_res ==
nullptr) ?
nullptr : prev_word_res->
best_choice;
1631 if (!row)
return nullptr;
1632 for (restart_page(); this->row() != row; forward()) {
1645 while (block_res == next_block_res &&
1646 (next_row_res !=
nullptr && next_row_res->row !=
nullptr &&
1647 row_res->row->para() == next_row_res->row->para())) {
1648 internal_forward(
false,
true);
1650 return internal_forward(
false,
true);
1660 while (block_res == next_block_res) {
1661 internal_forward(
false,
true);
1663 return internal_forward(
false,
true);
1667 int16_t chars_in_word;
1668 int16_t rejects_in_word = 0;
1670 chars_in_word = word_res->reject_map.length ();
1671 page_res->char_count += chars_in_word;
1672 block_res->char_count += chars_in_word;
1673 row_res->char_count += chars_in_word;
1675 rejects_in_word = word_res->reject_map.reject_count ();
1677 page_res->rej_count += rejects_in_word;
1678 block_res->rej_count += rejects_in_word;
1679 row_res->rej_count += rejects_in_word;
1680 if (chars_in_word == rejects_in_word)
1681 row_res->whole_word_rej_count += rejects_in_word;