21 #include "config_auto.h"
82 : left_margin_(-INT32_MAX), right_margin_(INT32_MAX),
83 median_bottom_(INT32_MAX), median_top_(-INT32_MAX),
84 median_left_(INT32_MAX), median_right_(-INT32_MAX),
85 blob_type_(blob_type),
87 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
117 ColPartition_LIST* big_part_list) {
126 if (big_part_list !=
nullptr) {
127 ColPartition_IT part_it(big_part_list);
128 part_it.add_to_end(single);
136 ColPartition_C_IT it(&upper_partners_);
137 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
138 it.data()->RemovePartner(
false,
this);
140 it.set_to_list(&lower_partners_);
141 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
142 it.data()->RemovePartner(
true,
this);
150 int left,
int bottom,
151 int right,
int top) {
153 part->bounding_box_ =
TBOX(left, bottom, right, top);
154 part->median_bottom_ = bottom;
155 part->median_top_ = top;
156 part->median_height_ = top - bottom;
157 part->median_left_ = left;
158 part->median_right_ = right;
159 part->median_width_ = right - left;
160 part->left_key_ = part->BoxLeftKey();
161 part->right_key_ = part->BoxRightKey();
172 if (boxes_.length() == 0) {
175 bounding_box_ += box;
179 if (!last_add_was_vertical_) {
180 boxes_.sort(SortByBoxBottom<BLOBNBOX>);
181 last_add_was_vertical_ =
true;
183 boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>,
true, bbox);
185 if (last_add_was_vertical_) {
186 boxes_.sort(SortByBoxLeft<BLOBNBOX>);
187 last_add_was_vertical_ =
false;
189 boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>,
true, bbox);
196 tprintf(
"Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
198 bounding_box_.
left(), bounding_box_.
right());
203 BLOBNBOX_C_IT bb_it(&boxes_);
204 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
205 if (box == bb_it.data()) {
217 BLOBNBOX_C_IT bb_it(&boxes_);
218 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
221 if (biggest ==
nullptr ||
225 if (biggest ==
nullptr ||
236 BLOBNBOX_C_IT bb_it(&boxes_);
237 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
238 if (box != bb_it.data()) {
239 result += bb_it.data()->bounding_box();
248 BLOBNBOX_C_IT bb_it(&boxes_);
249 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
252 if (other ==
nullptr) {
264 BLOBNBOX_C_IT bb_it(&boxes_);
265 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
277 BLOBNBOX_C_IT bb_it(&boxes_);
278 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
280 if (bblob->
owner() ==
this)
290 BLOBNBOX_C_IT bb_it(&boxes_);
291 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
298 if (bb_it.empty())
return false;
309 for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
311 delete bblob->
cblob();
321 BLOBNBOX_CLIST reversed_boxes;
322 BLOBNBOX_C_IT reversed_it(&reversed_boxes);
324 BLOBNBOX_C_IT bb_it(&boxes_);
325 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
326 reversed_it.add_before_then_move(bb_it.extract());
328 bb_it.add_list_after(&reversed_boxes);
330 int tmp = left_margin_;
331 left_margin_ = -right_margin_;
332 right_margin_ = -tmp;
343 if (bounding_box_.
left() > bounding_box_.
right()) {
345 tprintf(
"Bounding box invalid\n");
350 if (left_margin_ > bounding_box_.
left() ||
351 right_margin_ < bounding_box_.
right()) {
360 tprintf(
"Key inside box: %d v %d or %d v %d\n",
371 int y = (
MidY() + other.
MidY()) / 2;
414 if (bounding_box_.
right() < other.bounding_box_.
left() &&
417 if (other.bounding_box_.
right() < bounding_box_.
left() &&
420 if (bounding_box_.
left() > other.bounding_box_.
right() &&
423 if (other.bounding_box_.
left() > bounding_box_.
right() &&
431 double fractional_tolerance,
432 double constant_tolerance)
const {
434 int nonmatch_count = 0;
435 BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
436 BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
437 box_it.mark_cycle_pt();
438 other_it.mark_cycle_pt();
439 while (!box_it.cycled_list() && !other_it.cycled_list()) {
440 if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
441 fractional_tolerance,
449 return match_count > nonmatch_count;
460 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
461 int min_top = INT32_MAX;
462 int max_bottom = -INT32_MAX;
463 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
467 tprintf(
"Blob is not a diacritic:");
479 bool result = min_top > candidate.median_bottom_ &&
480 max_bottom < candidate.median_top_;
485 tprintf(
"y ranges don\'t overlap: %d-%d / %d-%d\n",
486 max_bottom, min_top, median_bottom_, median_top_);
495 if (tab_vector !=
nullptr) {
499 left_key_tab_ =
false;
507 if (tab_vector !=
nullptr) {
508 right_key_ = tab_vector->
sort_key();
511 right_key_tab_ =
false;
520 left_key_tab_ = take_box ? false : src.left_key_tab_;
522 left_key_ = src.left_key_;
527 if (left_margin_ > bounding_box_.
left())
528 left_margin_ = src.left_margin_;
533 right_key_tab_ = take_box ? false : src.right_key_tab_;
534 if (right_key_tab_) {
535 right_key_ = src.right_key_;
540 if (right_margin_ < bounding_box_.
right())
541 right_margin_ = src.right_margin_;
546 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
547 return it.data()->left_rule();
551 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
553 return it.data()->right_rule();
558 return special_blobs_densities_[
type];
563 BLOBNBOX_C_IT blob_it(&boxes_);
565 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
579 special_blobs_densities_[
type] = density;
583 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
584 if (boxes_.empty()) {
588 BLOBNBOX_C_IT blob_it(&boxes_);
589 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
592 special_blobs_densities_[
type]++;
595 for (
float& special_blobs_density : special_blobs_densities_) {
596 special_blobs_density /= boxes_.length();
605 partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
607 upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, partner);
609 partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
611 lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, partner);
619 ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
620 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
621 if (it.data() == partner) {
630 ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
631 if (!partners->singleton())
633 ColPartition_C_IT it(partners);
645 bounding_box_.
bottom()) ||
647 other->bounding_box_.
bottom())) {
654 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
656 unsigned w1 = boxes_.length();
657 unsigned w2 = other->boxes_.length();
658 float new_val = special_blobs_densities_[
type] * w1 +
659 other->special_blobs_densities_[
type] * w2;
662 special_blobs_densities_[
type] = new_val / (w1 + w2);
667 BLOBNBOX_C_IT it(&boxes_);
668 BLOBNBOX_C_IT it2(&other->boxes_);
669 for (; !it2.empty(); it2.forward()) {
672 if (prev_owner != other && prev_owner !=
nullptr) {
676 ASSERT_HOST(prev_owner == other || prev_owner ==
nullptr);
677 if (prev_owner == other)
679 it.add_to_end(bbox2);
681 left_margin_ = std::min(left_margin_, other->left_margin_);
682 right_margin_ = std::max(right_margin_, other->right_margin_);
683 if (other->left_key_ < left_key_) {
684 left_key_ = other->left_key_;
685 left_key_tab_ = other->left_key_tab_;
687 if (other->right_key_ > right_key_) {
688 right_key_ = other->right_key_;
689 right_key_tab_ = other->right_key_tab_;
694 flow_ = other->flow_;
695 blob_type_ = other->blob_type_;
699 boxes_.sort(SortByBoxBottom<BLOBNBOX>);
700 last_add_was_vertical_ =
true;
702 boxes_.sort(SortByBoxLeft<BLOBNBOX>);
703 last_add_was_vertical_ =
false;
708 for (
int upper = 0; upper < 2; ++upper) {
709 ColPartition_CLIST partners;
710 ColPartition_C_IT part_it(&partners);
711 part_it.add_list_after(upper ? &other->upper_partners_
712 : &other->lower_partners_);
713 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
738 int ok_box_overlap,
bool debug) {
742 tprintf(
"Vertical partition\n");
756 if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
757 merged_box.bottom() < bounding_box_.
top() - ok_box_overlap &&
758 merged_box.top() > bounding_box_.
bottom() + ok_box_overlap) {
760 tprintf(
"Excessive box overlap\n");
770 if (boxes_.empty() || boxes_.singleton())
772 BLOBNBOX_C_IT it(&boxes_);
773 TBOX left_box(it.data()->bounding_box());
774 for (it.forward(); !it.at_first(); it.forward()) {
777 if (left_box.overlap(box))
790 BLOBNBOX_C_IT it(&boxes_);
791 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
795 if (bbox == split_blob || !split_part->boxes_.empty()) {
796 split_part->
AddBox(it.extract());
808 right_key_tab_ =
false;
809 split_part->left_key_tab_ =
false;
824 if (split_x <= bounding_box_.
left() || split_x >= bounding_box_.
right())
828 BLOBNBOX_C_IT it(&boxes_);
829 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
834 if (box.
left() >= split_x) {
835 split_part->
AddBox(it.extract());
842 it.add_list_after(&split_part->boxes_);
851 right_key_tab_ =
false;
852 split_part->left_key_tab_ =
false;
853 right_margin_ = split_x;
854 split_part->left_margin_ = split_x;
862 bounding_box_ =
TBOX();
863 BLOBNBOX_C_IT it(&boxes_);
865 int non_leader_count = 0;
867 bounding_box_.
set_left(left_margin_);
872 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
884 tprintf(
"Computed left-illegal partition\n");
890 tprintf(
"Computed right-illegal partition\n");
897 median_top_ = bounding_box_.
top();
898 median_bottom_ = bounding_box_.
bottom();
899 median_height_ = bounding_box_.
height();
900 median_left_ = bounding_box_.
left();
901 median_right_ = bounding_box_.
right();
902 median_width_ = bounding_box_.
width();
905 STATS bottom_stats(bounding_box_.
bottom(), bounding_box_.
top() + 1);
907 STATS left_stats(bounding_box_.
left(), bounding_box_.
right() + 1);
908 STATS right_stats(bounding_box_.
left(), bounding_box_.
right() + 1);
909 STATS width_stats(0, bounding_box_.
width() + 1);
910 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
914 int area = box.
area();
915 top_stats.add(box.
top(), area);
916 bottom_stats.add(box.
bottom(), area);
917 height_stats.add(box.
height(), area);
918 left_stats.add(box.
left(), area);
919 right_stats.add(box.
right(), area);
920 width_stats.add(box.
width(), area);
923 median_top_ = static_cast<int>(top_stats.median() + 0.5);
924 median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
925 median_height_ = static_cast<int>(height_stats.median() + 0.5);
926 median_left_ = static_cast<int>(left_stats.median() + 0.5);
927 median_right_ = static_cast<int>(right_stats.median() + 0.5);
928 median_width_ = static_cast<int>(width_stats.median() + 0.5);
932 tprintf(
"Made partition with bad right coords");
936 tprintf(
"Made partition with bad left coords");
942 for (
int upper = 0; upper < 2; ++upper) {
943 ColPartition_CLIST partners;
944 ColPartition_C_IT part_it(&partners);
945 part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
946 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
953 bounding_box_.
bottom())) {
954 tprintf(
"Recomputed box for partition %p\n",
this);
961 BLOBNBOX_C_IT it(&boxes_);
962 int overlap_count = 0;
963 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
968 return overlap_count;
974 int first_spanned_col = -1;
977 bounding_box_.
left(), bounding_box_.
right(),
978 std::min(bounding_box_.
height(), bounding_box_.
width()),
979 MidY(), left_margin_, right_margin_,
980 &first_column_, &last_column_,
982 column_set_ = columns;
983 if (first_column_ < last_column_ && span_type ==
CST_PULLOUT &&
987 if (first_spanned_col >= 0) {
988 first_column_ = first_spanned_col;
989 last_column_ = first_spanned_col;
991 if ((first_column_ & 1) == 0)
992 last_column_ = first_column_;
993 else if ((last_column_ & 1) == 0)
994 first_column_ = last_column_;
996 first_column_ = last_column_ = (first_column_ + last_column_) / 2;
1014 switch (blob_type_) {
1057 int* first_col,
int* last_col) {
1058 int first_spanned_col = -1;
1061 bounding_box_.
left(), bounding_box_.
right(),
1062 std::min(bounding_box_.
height(), bounding_box_.
width()),
1063 MidY(), left_margin_, right_margin_,
1064 first_col, last_col,
1065 &first_spanned_col);
1073 good_width_ = cb(width);
1074 good_column_ = blob_type_ ==
BRT_TEXT && left_key_tab_ && right_key_tab_;
1084 bool result =
false;
1086 int part_width = bounding_box_.
width();
1087 STATS gap_stats(0, part_width);
1088 STATS width_stats(0, part_width);
1089 BLOBNBOX_C_IT it(&boxes_);
1094 for (it.forward(); !it.at_first(); it.forward()) {
1099 width_stats.
add(right - left, 1);
1104 double median_gap = gap_stats.
median();
1108 double gap_iqr = gap_stats.
ile(0.75f) - gap_stats.
ile(0.25f);
1110 tprintf(
"gap iqr = %g, blob_count=%d, limits=%g,%g\n",
1120 int offset = static_cast<int>(ceil(gap_iqr * 2));
1121 int min_step = static_cast<int>(median_gap +
median_width + 0.5);
1122 int max_step = min_step + offset;
1125 int part_left = bounding_box_.
left() - min_step / 2;
1126 part_width += min_step;
1127 auto* projection =
new DPPoint[part_width];
1128 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1133 for (
int x = left; x < right; ++x) {
1134 projection[left - part_left].AddLocalCost(height);
1139 part_width, projection);
1140 if (best_end !=
nullptr && best_end->
total_cost() < blob_count) {
1143 bool modified_blob_list =
false;
1144 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1147 if (it.at_first()) {
1152 modified_blob_list =
true;
1158 it.data_relative(-1)->bounding_box().right();
1161 modified_blob_list =
true;
1172 if (best_end ==
nullptr) {
1179 delete [] projection;
1193 int good_blob_score_ = 0;
1194 int noisy_count = 0;
1195 int hline_count = 0;
1196 int vline_count = 0;
1197 BLOBNBOX_C_IT it(&boxes_);
1198 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1208 if (hline_count > vline_count) {
1211 }
else if (vline_count > hline_count) {
1214 }
else if (value < -1 || 1 < value) {
1218 long_side = bounding_box_.
width();
1219 short_side = bounding_box_.
height();
1222 long_side = bounding_box_.
height();
1223 short_side = bounding_box_.
width();
1239 if (flow_ ==
BTFT_CHAIN && strong_score == 3)
1247 if (noisy_count >= blob_count) {
1253 bounding_box_.
bottom())) {
1254 tprintf(
"RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1255 blob_count, noisy_count, good_blob_score_);
1256 tprintf(
" Projection value=%d, flow=%d, blob_type=%d\n",
1257 value, flow_, blob_type_);
1268 BLOBNBOX_C_IT it(&boxes_);
1269 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1286 int total_height = 0;
1288 int height_count = 0;
1290 BLOBNBOX_C_IT it(&boxes_);
1291 TBOX box(it.data()->bounding_box());
1297 ICOORD first_pt(box.right(), box.bottom());
1300 linepoints.
Add(first_pt);
1301 for (it.forward(); !it.at_last(); it.forward()) {
1304 ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1305 linepoints.
Add(box_pt);
1306 total_height += box.width();
1307 coverage += box.height();
1310 box = it.data()->bounding_box();
1311 ICOORD last_pt(box.right(), box.top());
1312 linepoints.
Add(last_pt);
1313 width = last_pt.y() - first_pt.y();
1317 TBOX box(it.data()->bounding_box());
1320 ICOORD first_pt(box.left(), box.bottom());
1321 linepoints.
Add(first_pt);
1322 for (it.forward(); !it.at_last(); it.forward()) {
1325 ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1326 linepoints.
Add(box_pt);
1327 total_height += box.height();
1328 coverage += box.width();
1331 box = it.data()->bounding_box();
1332 ICOORD last_pt(box.right(), box.bottom());
1333 linepoints.
Add(last_pt);
1334 width = last_pt.x() - first_pt.x();
1337 if (height_count == 0)
1341 double error = linepoints.
Fit(&start_pt, &end_pt);
1349 ColPartition_LIST* used_parts,
1350 WorkingPartSet_LIST* working_sets) {
1353 block_owned_ =
true;
1354 WorkingPartSet_IT it(working_sets);
1357 if (partner !=
nullptr && partner->working_set_ !=
nullptr) {
1358 working_set_ = partner->working_set_;
1363 tprintf(
"Partition with partner has no working set!:");
1371 for (it.mark_cycle_pt(); !it.cycled_list() &&
1372 col_index != first_column_;
1373 it.forward(), ++col_index);
1375 tprintf(
"Match is %s for:", (col_index & 1) ?
"Real" :
"Between");
1379 tprintf(
"Target column=%d, only had %d\n", first_column_, col_index);
1382 work_set = it.data();
1385 if (!it.cycled_list() && last_column_ != first_column_ && !
IsPulloutType()) {
1387 BLOCK_LIST completed_blocks;
1388 TO_BLOCK_LIST to_blocks;
1389 for (; !it.cycled_list() && col_index <= last_column_;
1390 it.forward(), ++col_index) {
1393 &completed_blocks, &to_blocks);
1395 work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1397 working_set_ = work_set;
1409 ColPartition_LIST* block_parts,
1410 ColPartition_LIST* used_parts,
1411 BLOCK_LIST* completed_blocks,
1412 TO_BLOCK_LIST* to_blocks) {
1413 int page_height = tright.
y() - bleft.
y();
1415 ColPartition_IT it(block_parts);
1417 int max_line_height = 0;
1423 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1429 BLOBNBOX_C_IT blob_it(part->
boxes());
1430 int prev_bottom = blob_it.data()->bounding_box().bottom();
1431 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1434 int step = bottom - prev_bottom;
1437 side_steps.add(step, 1);
1438 prev_bottom = bottom;
1440 part->
set_side_step(static_cast<int>(side_steps.median() + 0.5));
1441 if (!it.at_last()) {
1452 tprintf(
"side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1457 if (part_count == 0)
1460 SmoothSpacings(resolution, page_height, block_parts);
1463 BLOCK_IT block_it(completed_blocks);
1464 TO_BLOCK_IT to_block_it(to_blocks);
1465 ColPartition_LIST spacing_parts;
1466 ColPartition_IT sp_block_it(&spacing_parts);
1468 for (it.mark_cycle_pt(); !it.empty();) {
1470 sp_block_it.add_to_end(part);
1472 if (it.empty() || part->
bottom_spacing() > same_block_threshold ||
1473 !part->SpacingsEqual(*it.data(), resolution)) {
1476 if (!it.empty() && part->
bottom_spacing() <= same_block_threshold) {
1480 ColPartition* third_part = it.at_last() ? nullptr : it.data_relative(1);
1482 tprintf(
"Spacings unequal: upper:%d/%d, lower:%d/%d,"
1483 " sizes %d %d %d\n",
1491 if (part->SizesSimilar(*next_part) &&
1498 if (third_part ==
nullptr ||
1499 !next_part->SizesSimilar(*third_part) ||
1506 sp_block_it.add_to_end(it.extract());
1509 tprintf(
"Added line to current block.\n");
1515 if (to_block !=
nullptr) {
1516 to_block_it.add_to_end(to_block);
1517 block_it.add_to_end(to_block->
block);
1519 sp_block_it.set_to_list(&spacing_parts);
1523 tprintf(
"Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\n",
1534 if (pos->
x() < bleft.
x())
1536 if (pos->
x() > tright.
x())
1538 if (pos->
y() < bleft.
y())
1540 if (pos->
y() > tright.
y())
1548 static TO_BLOCK* MoveBlobsToBlock(
bool vertical_text,
int line_spacing,
1550 ColPartition_LIST* block_parts,
1551 ColPartition_LIST* used_parts) {
1557 STATS sizes(0, std::max(block_box.width(), block_box.height()));
1559 ColPartition_IT it(block_parts);
1560 auto* to_block =
new TO_BLOCK(block);
1561 BLOBNBOX_IT blob_it(&to_block->blobs);
1562 ColPartition_IT used_it(used_parts);
1563 for (it.move_to_first(); !it.empty(); it.forward()) {
1564 ColPartition* part = it.extract();
1568 for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
1571 if (bblob->
owner() != part) {
1572 tprintf(
"Ownership incorrect for blob:");
1576 if (bblob->
owner() ==
nullptr) {
1589 C_OUTLINE_IT ol_it(outlines);
1590 ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1595 blob_it.add_after_then_move(bblob);
1597 used_it.add_to_end(part);
1599 if (text_type && blob_it.empty()) {
1604 to_block->line_size = sizes.median();
1605 if (vertical_text) {
1607 if (block_width < line_spacing)
1608 line_spacing = block_width;
1609 to_block->line_spacing = static_cast<float>(line_spacing);
1610 to_block->max_blob_size = static_cast<float>(block_width + 1);
1613 if (block_height < line_spacing)
1614 line_spacing = block_height;
1615 to_block->line_spacing = static_cast<float>(line_spacing);
1616 to_block->max_blob_size = static_cast<float>(block_height + 1);
1624 ColPartition_LIST* block_parts,
1625 ColPartition_LIST* used_parts) {
1626 if (block_parts->empty())
1632 ColPartition_IT it(block_parts);
1643 ICOORDELT_LIST vertices;
1644 ICOORDELT_IT vert_it(&vertices);
1646 int min_x = INT32_MAX;
1647 int max_x = -INT32_MAX;
1648 int min_y = INT32_MAX;
1649 int max_y = -INT32_MAX;
1653 ColPartition::LeftEdgeRun(&it, &start, &end);
1655 ColPartition::RightEdgeRun(&it, &start, &end);
1656 ClipCoord(bleft, tright, &start);
1657 ClipCoord(bleft, tright, &end);
1658 vert_it.add_after_then_move(
new ICOORDELT(start));
1659 vert_it.add_after_then_move(
new ICOORDELT(end));
1664 if ((iteration == 0 && it.at_first()) ||
1665 (iteration == 1 && it.at_last())) {
1669 }
while (iteration < 2);
1671 tprintf(
"Making block at (%d,%d)->(%d,%d)\n",
1672 min_x, min_y, max_x, max_y);
1673 auto* block =
new BLOCK(
"",
true, 0, 0, min_x, min_y, max_x, max_y);
1675 return MoveBlobsToBlock(
false, line_spacing, block, block_parts, used_parts);
1682 ColPartition_LIST* block_parts,
1683 ColPartition_LIST* used_parts) {
1684 if (block_parts->empty())
1686 ColPartition_IT it(block_parts);
1689 int line_spacing = block_box.
width();
1691 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1692 block_box += it.data()->bounding_box();
1698 auto* block =
new BLOCK(
"",
true, 0, 0, block_box.
left(), block_box.
bottom(),
1699 block_box.
right(), block_box.
top());
1701 return MoveBlobsToBlock(
true, line_spacing, block, block_parts, used_parts);
1707 BLOBNBOX_C_IT blob_it(&boxes_);
1709 int line_size =
IsVerticalType() ? median_width_ : median_height_;
1711 for (; !blob_it.empty(); blob_it.forward()) {
1712 BLOBNBOX* blob = blob_it.extract();
1716 if (row ==
nullptr) {
1717 row =
new TO_ROW(blob, static_cast<float>(top),
1718 static_cast<float>(bottom),
1719 static_cast<float>(line_size));
1721 row->
add_blob(blob, static_cast<float>(top),
1722 static_cast<float>(bottom),
1723 static_cast<float>(line_size));
1733 part->left_margin_ = left_margin_;
1734 part->right_margin_ = right_margin_;
1735 part->bounding_box_ = bounding_box_;
1736 memcpy(part->special_blobs_densities_, special_blobs_densities_,
1737 sizeof(special_blobs_densities_));
1738 part->median_bottom_ = median_bottom_;
1739 part->median_top_ = median_top_;
1740 part->median_height_ = median_height_;
1741 part->median_left_ = median_left_;
1742 part->median_right_ = median_right_;
1743 part->median_width_ = median_width_;
1744 part->good_width_ = good_width_;
1745 part->good_column_ = good_column_;
1746 part->left_key_tab_ = left_key_tab_;
1747 part->right_key_tab_ = right_key_tab_;
1748 part->type_ = type_;
1749 part->flow_ = flow_;
1750 part->left_key_ = left_key_;
1751 part->right_key_ = right_key_;
1752 part->first_column_ = first_column_;
1753 part->last_column_ = last_column_;
1754 part->owns_blobs_ =
false;
1761 BLOBNBOX_C_IT inserter(copy->
boxes());
1762 BLOBNBOX_C_IT traverser(
boxes());
1763 for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
1764 inserter.add_after_then_move(traverser.data());
1768 #ifndef GRAPHICS_DISABLED
1776 #endif // GRAPHICS_DISABLED
1779 static char kBlobTypes[
BRT_COUNT + 1] =
"NHSRIUVT";
1784 tprintf(
"ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1785 " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1786 " ts=%d bs=%d ls=%d rs=%d\n",
1787 boxes_.empty() ?
'E' :
' ',
1788 left_margin_, left_key_tab_ ?
'T' :
'B',
LeftAtY(y),
1789 bounding_box_.
left(), median_left_,
1790 bounding_box_.
bottom(), median_bottom_,
1791 bounding_box_.
right(),
RightAtY(y), right_key_tab_ ?
'T' :
'B',
1792 right_margin_, median_right_, bounding_box_.
top(), median_top_,
1793 good_width_, good_column_, type_,
1794 kBlobTypes[blob_type_], flow_,
1795 first_column_, last_column_, boxes_.length(),
1796 space_above_, space_below_, space_to_left_, space_to_right_);
1801 tprintf(
"Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
1802 color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
1803 color1_[L_ALPHA_CHANNEL],
1804 color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1809 STATS left_stats(0, working_set_count);
1810 STATS right_stats(0, working_set_count);
1815 if (partner->type_ > max_type)
1816 max_type = partner->type_;
1817 if (column_set_ == partner->column_set_) {
1818 left_stats.
add(partner->first_column_, 1);
1819 right_stats.
add(partner->last_column_, 1);
1827 first_column_ = left_stats.
mode();
1828 last_column_ = right_stats.
mode();
1829 if (last_column_ < first_column_)
1830 last_column_ = first_column_;
1835 partner->type_ = max_type;
1836 #if 0 // See TODO above
1837 if (column_set_ == partner->column_set_) {
1838 partner->first_column_ = first_column_;
1839 partner->last_column_ = last_column_;
1880 RefinePartnersInternal(
true, get_desperate, grid);
1881 RefinePartnersInternal(
false, get_desperate, grid);
1885 RefinePartnersByType(
true, &upper_partners_);
1886 RefinePartnersByType(
false, &lower_partners_);
1890 if (!upper_partners_.empty() && !upper_partners_.singleton())
1891 RefinePartnersByOverlap(
true, &upper_partners_);
1892 if (!lower_partners_.empty() && !lower_partners_.singleton())
1893 RefinePartnersByOverlap(
false, &lower_partners_);
1902 void ColPartition::RefinePartnersInternal(
bool upper,
bool get_desperate,
1904 ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
1905 if (!partners->empty() && !partners->singleton()) {
1906 RefinePartnersByType(upper, partners);
1907 if (!partners->empty() && !partners->singleton()) {
1909 RefinePartnerShortcuts(upper, partners);
1910 if (!partners->empty() && !partners->singleton()) {
1914 RefineTextPartnersByMerge(upper,
false, partners, grid);
1915 if (!partners->empty() && !partners->singleton())
1916 RefineTextPartnersByMerge(upper,
true, partners, grid);
1919 if (!partners->empty() && !partners->singleton())
1920 RefinePartnersByOverlap(upper, partners);
1929 void ColPartition::RefinePartnersByType(
bool upper,
1930 ColPartition_CLIST* partners) {
1934 tprintf(
"Refining %d %s partners by type for:\n",
1935 partners->length(), upper ?
"Upper" :
"Lower");
1938 ColPartition_C_IT it(partners);
1944 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1951 partner->RemovePartner(!upper,
this);
1960 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1968 partner->RemovePartner(!upper,
this);
1983 void ColPartition::RefinePartnerShortcuts(
bool upper,
1984 ColPartition_CLIST* partners) {
1985 bool done_any =
false;
1988 ColPartition_C_IT it(partners);
1989 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1993 ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
1994 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
1999 a->RemovePartner(!upper,
this);
2002 ColPartition_C_IT it2(partners);
2003 for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2008 b2->RemovePartner(!upper,
this);
2021 }
while (done_any && !partners->empty() && !partners->singleton());
2032 void ColPartition::RefineTextPartnersByMerge(
bool upper,
bool desperate,
2033 ColPartition_CLIST* partners,
2034 ColPartitionGrid* grid) {
2038 tprintf(
"Refining %d %s partners by merge for:\n",
2039 partners->length(), upper ?
"Upper" :
"Lower");
2042 while (!partners->empty() && !partners->singleton()) {
2045 ColPartition_C_IT it(partners);
2049 ColPartition_CLIST candidates;
2050 ColPartition_C_IT cand_it(&candidates);
2051 for (it.forward(); !it.at_first(); it.forward()) {
2053 if (part->first_column_ == candidate->last_column_ &&
2054 part->last_column_ == candidate->first_column_)
2055 cand_it.add_after_then_move(it.data());
2057 int overlap_increase;
2058 ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
2059 nullptr, &overlap_increase);
2060 if (candidate !=
nullptr && (overlap_increase <= 0 || desperate)) {
2062 tprintf(
"Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2063 part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2067 grid->RemoveBBox(candidate);
2068 grid->RemoveBBox(part);
2069 part->Absorb(candidate,
nullptr);
2071 grid->InsertBBox(
true,
true, part);
2072 if (overlap_increase > 0)
2073 part->desperately_merged_ =
true;
2082 void ColPartition::RefinePartnersByOverlap(
bool upper,
2083 ColPartition_CLIST* partners) {
2087 tprintf(
"Refining %d %s partners by overlap for:\n",
2088 partners->length(), upper ?
"Upper" :
"Lower");
2091 ColPartition_C_IT it(partners);
2094 int best_overlap = 0;
2095 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2097 int overlap = std::min(bounding_box_.
right(), partner->bounding_box_.right())
2098 - std::max(bounding_box_.
left(), partner->bounding_box_.left());
2099 if (overlap > best_overlap) {
2100 best_overlap = overlap;
2101 best_partner = partner;
2105 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2107 if (partner != best_partner) {
2112 partner->RemovePartner(!upper,
this);
2119 bool ColPartition::ThisPartitionBetter(
BLOBNBOX* bbox,
2120 const ColPartition& other) {
2123 int left = box.
left();
2124 int right = box.
right();
2125 if (left < left_margin_ || right > right_margin_)
2127 if (left < other.left_margin_ || right > other.right_margin_)
2129 int top = box.
top();
2130 int bottom = box.
bottom();
2131 int this_overlap = std::min(top, median_top_) - std::max(bottom, median_bottom_);
2132 int other_overlap = std::min(top, other.median_top_) -
2133 std::max(bottom, other.median_bottom_);
2134 int this_miss = median_top_ - median_bottom_ - this_overlap;
2135 int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2137 tprintf(
"Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2139 this_overlap, other_overlap, this_miss, other_miss,
2140 median_top_, other.median_top_);
2142 if (this_miss < other_miss)
2144 if (this_miss > other_miss)
2146 if (this_overlap > other_overlap)
2148 if (this_overlap < other_overlap)
2150 return median_top_ >= other.median_top_;
2157 static int MedianSpacing(
int page_height, ColPartition_IT it) {
2158 STATS stats(0, page_height);
2159 while (!it.cycled_list()) {
2160 ColPartition* part = it.data();
2162 stats.add(part->bottom_spacing(), 1);
2163 stats.add(part->top_spacing(), 1);
2165 return static_cast<int>(stats.median() + 0.5);
2179 return (last_column_ >= part.first_column_) &&
2180 (first_column_ <= part.last_column_);
2186 void ColPartition::SmoothSpacings(
int resolution,
int page_height,
2187 ColPartition_LIST* parts) {
2195 ColPartition_IT it(parts);
2202 int median_space = MedianSpacing(page_height, it);
2203 ColPartition_IT start_it(it);
2204 ColPartition_IT end_it(it);
2205 for (
int i = 0; i < PN_COUNT; ++i) {
2206 if (i < PN_UPPER || it.cycled_list()) {
2207 neighbourhood[i] =
nullptr;
2211 neighbourhood[i] = it.data();
2215 while (neighbourhood[PN_UPPER] !=
nullptr) {
2237 if (neighbourhood[PN_LOWER] ==
nullptr ||
2238 (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2240 !OKSpacingBlip(resolution, median_space, neighbourhood) &&
2241 (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
2242 !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2243 (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
2244 !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2247 ColPartition_IT sum_it(start_it);
2249 double total_bottom = 0.0;
2250 double total_top = 0.0;
2251 int total_count = 0;
2254 while (upper != last_part) {
2255 total_bottom += upper->bottom_spacing();
2256 total_top += upper->top_spacing();
2259 upper = sum_it.data();
2261 if (total_count > 0) {
2263 int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2264 int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2266 tprintf(
"Spacing run ended. Cause:");
2267 if (neighbourhood[PN_LOWER] ==
nullptr) {
2270 tprintf(
"Spacing change. Spacings:\n");
2271 for (
int i = 0; i < PN_COUNT; ++i) {
2272 if (neighbourhood[i] ==
nullptr) {
2274 if (i > 0 && neighbourhood[i - 1] !=
nullptr) {
2279 tprintf(
" nullptr lower partner:\n");
2285 tprintf(
"Top = %d, bottom = %d\n",
2294 upper = sum_it.data();
2295 while (upper != last_part) {
2303 upper = sum_it.data();
2310 median_space = MedianSpacing(page_height, end_it);
2313 for (
int j = 1; j < PN_COUNT; ++j) {
2314 neighbourhood[j - 1] = neighbourhood[j];
2316 if (it.cycled_list()) {
2317 neighbourhood[PN_COUNT - 1] =
nullptr;
2319 neighbourhood[PN_COUNT - 1] = it.data();
2329 bool ColPartition::OKSpacingBlip(
int resolution,
int median_spacing,
2330 ColPartition** parts) {
2331 if (parts[PN_UPPER] ==
nullptr || parts[PN_LOWER] ==
nullptr)
2335 return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
2336 median_spacing, resolution) &&
2337 ((parts[PN_ABOVE1] !=
nullptr &&
2338 parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2339 (parts[PN_BELOW1] !=
nullptr &&
2340 parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2345 bool ColPartition::SpacingEqual(
int spacing,
int resolution)
const {
2346 int bottom_error = BottomSpacingMargin(resolution);
2347 int top_error = TopSpacingMargin(resolution);
2348 return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2354 bool ColPartition::SpacingsEqual(
const ColPartition& other,
2355 int resolution)
const {
2356 int bottom_error = std::max(BottomSpacingMargin(resolution),
2357 other.BottomSpacingMargin(resolution));
2358 int top_error = std::max(TopSpacingMargin(resolution),
2359 other.TopSpacingMargin(resolution));
2360 return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2361 (
NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2362 NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2369 bool ColPartition::SummedSpacingOK(
const ColPartition& other,
2370 int spacing,
int resolution)
const {
2371 int bottom_error = std::max(BottomSpacingMargin(resolution),
2372 other.BottomSpacingMargin(resolution));
2373 int top_error = std::max(TopSpacingMargin(resolution),
2374 other.TopSpacingMargin(resolution));
2375 int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2376 int top_total = top_spacing_ + other.top_spacing_;
2377 return (
NearlyEqual(spacing, bottom_total, bottom_error) &&
2379 (
NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2385 int ColPartition::BottomSpacingMargin(
int resolution)
const {
2391 int ColPartition::TopSpacingMargin(
int resolution)
const {
2393 BottomSpacingMargin(resolution);
2398 bool ColPartition::SizesSimilar(
const ColPartition& other)
const {
2399 return median_height_ <= other.median_height_ *
kMaxSizeRatio &&
2406 static bool UpdateLeftMargin(
const ColPartition& part,
2407 int* margin_left,
int* margin_right) {
2408 const TBOX& part_box = part.bounding_box();
2409 int top = part_box.
top();
2410 int bottom = part_box.
bottom();
2411 int tl_key = part.SortKey(part.left_margin(), top);
2412 int tr_key = part.SortKey(part_box.
left(), top);
2413 int bl_key = part.SortKey(part.left_margin(), bottom);
2414 int br_key = part.SortKey(part_box.
left(), bottom);
2415 int left_key = std::max(tl_key, bl_key);
2416 int right_key = std::min(tr_key, br_key);
2417 if (left_key <= *margin_right && right_key >= *margin_left) {
2419 *margin_right = std::min(*margin_right, right_key);
2420 *margin_left = std::max(*margin_left, left_key);
2431 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
2435 int start_y = part->bounding_box_.top();
2436 if (!part_it->at_first()) {
2437 int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2438 if (prev_bottom < start_y)
2439 start_y = prev_bottom;
2440 else if (prev_bottom > start_y)
2441 start_y = (start_y + prev_bottom) / 2;
2443 int end_y = part->bounding_box_.bottom();
2444 int margin_right = INT32_MAX;
2445 int margin_left = -INT32_MAX;
2446 UpdateLeftMargin(*part, &margin_left, &margin_right);
2449 part = part_it->data();
2450 }
while (!part_it->at_first() &&
2451 UpdateLeftMargin(*part, &margin_left, &margin_right));
2455 int next_margin_right = INT32_MAX;
2456 int next_margin_left = -INT32_MAX;
2457 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2458 if (next_margin_left > margin_right) {
2459 ColPartition_IT next_it(*part_it);
2462 part = next_it.data();
2463 }
while (!next_it.at_first() &&
2464 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2468 part_it->backward();
2469 part = part_it->data();
2470 }
while (part != start_part &&
2471 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2475 part = part_it->data_relative(-1);
2476 end_y = part->bounding_box_.bottom();
2477 if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
2478 end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2479 start->
set_y(start_y);
2480 start->
set_x(part->XAtY(margin_right, start_y));
2482 end->
set_x(part->XAtY(margin_right, end_y));
2484 tprintf(
"Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2485 start_y, end_y, part->XAtY(margin_left, end_y),
2486 end->
x(), part->left_margin_, part->bounding_box_.left());
2492 static bool UpdateRightMargin(
const ColPartition& part,
2493 int* margin_left,
int* margin_right) {
2494 const TBOX& part_box = part.bounding_box();
2495 int top = part_box.
top();
2496 int bottom = part_box.
bottom();
2497 int tl_key = part.SortKey(part_box.
right(), top);
2498 int tr_key = part.SortKey(part.right_margin(), top);
2499 int bl_key = part.SortKey(part_box.
right(), bottom);
2500 int br_key = part.SortKey(part.right_margin(), bottom);
2501 int left_key = std::max(tl_key, bl_key);
2502 int right_key = std::min(tr_key, br_key);
2503 if (left_key <= *margin_right && right_key >= *margin_left) {
2505 *margin_right = std::min(*margin_right, right_key);
2506 *margin_left = std::max(*margin_left, left_key);
2518 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
2522 int start_y = part->bounding_box_.bottom();
2523 if (!part_it->at_last()) {
2524 int next_y = part_it->data_relative(1)->bounding_box_.top();
2525 if (next_y > start_y)
2527 else if (next_y < start_y)
2528 start_y = (start_y + next_y) / 2;
2530 int end_y = part->bounding_box_.top();
2531 int margin_right = INT32_MAX;
2532 int margin_left = -INT32_MAX;
2533 UpdateRightMargin(*part, &margin_left, &margin_right);
2535 part_it->backward();
2536 part = part_it->data();
2537 }
while (!part_it->at_last() &&
2538 UpdateRightMargin(*part, &margin_left, &margin_right));
2541 int next_margin_right = INT32_MAX;
2542 int next_margin_left = -INT32_MAX;
2543 UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2544 if (next_margin_right < margin_left) {
2545 ColPartition_IT next_it(*part_it);
2548 part = next_it.data();
2549 }
while (!next_it.at_last() &&
2550 UpdateRightMargin(*part, &next_margin_left,
2551 &next_margin_right));
2556 part = part_it->data();
2557 }
while (part != start_part &&
2558 UpdateRightMargin(*part, &next_margin_left,
2559 &next_margin_right));
2560 part_it->backward();
2563 part = part_it->data_relative(1);
2564 end_y = part->bounding_box().top();
2565 if (!part_it->at_last() &&
2566 part_it->data()->bounding_box_.bottom() > end_y)
2567 end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2568 start->
set_y(start_y);
2569 start->
set_x(part->XAtY(margin_left, start_y));
2571 end->
set_x(part->XAtY(margin_left, end_y));
2573 tprintf(
"Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2574 start_y, end_y, end->
x(), part->XAtY(margin_right, end_y),
2575 part->bounding_box_.right(), part->right_margin_);