20 #include "config_auto.h"
123 bool any_done =
false;
125 bool merge_done =
false;
135 if (!box_cb(part, &box))
138 ColPartition_CLIST merge_candidates;
139 FindMergeCandidates(part, box, debug, &merge_candidates);
141 int overlap_increase;
145 if (neighbour !=
nullptr && overlap_increase <= 0) {
147 tprintf(
"Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
156 part->
Absorb(neighbour,
nullptr);
160 }
else if (neighbour !=
nullptr) {
162 tprintf(
"Overlapped when merged with increase %d: ", overlap_increase);
166 tprintf(
"No candidate neighbour returned\n");
168 }
while (merge_done);
181 if (candidate == part)
188 tprintf(
"Examining merge candidate:");
194 if (h_dist >= std::max(part_box.
width(), c_box.
width()) / 2) {
196 tprintf(
"Too far away: h_dist = %d\n", h_dist);
202 if (v_dist >= std::max(part_box.
height(), c_box.
height()) / 2) {
204 tprintf(
"Too far away: v_dist = %d\n", v_dist);
213 tprintf(
"Candidate fails overlap and diacritic tests!\n");
225 static int IncreaseInOverlap(
const ColPartition* merge1,
226 const ColPartition* merge2,
228 ColPartition_CLIST* parts) {
229 ASSERT_HOST(merge1 !=
nullptr && merge2 !=
nullptr);
231 ColPartition_C_IT it(parts);
232 TBOX merged_box(merge1->bounding_box());
233 merged_box += merge2->bounding_box();
234 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
235 ColPartition* part = it.data();
236 if (part == merge1 || part == merge2)
238 TBOX part_box = part->bounding_box();
241 if (overlap_area > 0 && !part->OKMergeOverlap(*merge1, *merge2,
242 ok_overlap,
false)) {
243 total_area += overlap_area;
245 overlap_area = part_box.
intersection(merge1->bounding_box()).area();
246 if (overlap_area > 0)
247 total_area -= overlap_area;
249 overlap_area = intersection_box.
area();
250 if (overlap_area > 0) {
251 total_area -= overlap_area;
253 intersection_box &= merge1->bounding_box();
254 overlap_area = intersection_box.
area();
255 if (overlap_area > 0)
256 total_area += overlap_area;
284 static bool TestCompatibleCandidates(
const ColPartition& part,
bool debug,
285 ColPartition_CLIST* candidates) {
286 ColPartition_C_IT it(candidates);
287 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
288 ColPartition* candidate = it.data();
289 if (!candidate->OKDiacriticMerge(part,
false)) {
290 ColPartition_C_IT it2(it);
291 for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
292 ColPartition* candidate2 = it2.data();
293 if (candidate2 != candidate &&
294 !OKMergeCandidate(candidate, candidate2,
false)) {
296 tprintf(
"NC overlap failed:Candidate:");
297 candidate2->bounding_box().print();
298 tprintf(
"fails to be a good merge with:");
299 candidate->bounding_box().print();
313 int total_overlap = 0;
319 ColPartition_CLIST neighbors;
322 ColPartition_C_IT n_it(&neighbors);
323 bool any_part_overlap =
false;
324 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
325 const TBOX& n_box = n_it.data()->bounding_box();
327 if (overlap > 0 && overlap_grid !=
nullptr) {
328 if (*overlap_grid ==
nullptr) {
331 (*overlap_grid)->InsertBBox(
true,
true, n_it.data()->ShallowCopy());
332 if (!any_part_overlap) {
333 (*overlap_grid)->InsertBBox(
true,
true, part->
ShallowCopy());
336 any_part_overlap =
true;
337 total_overlap += overlap;
340 return total_overlap;
348 ColPartition_CLIST* parts) {
353 if (part != not_this)
354 parts->add_sorted(SortByBoxLeft<ColPartition>,
true, part);
400 const ColPartition* part, ColPartition_CLIST* candidates,
bool debug,
402 int* overlap_increase) {
403 if (overlap_increase !=
nullptr)
404 *overlap_increase = 0;
405 if (candidates->empty())
414 ColPartition_C_IT it(candidates);
417 TBOX full_box(part_box);
418 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
423 ColPartition_CLIST neighbours;
428 tprintf(
"Finding best merge candidate from %d, %d neighbours for box:",
429 candidates->length(), neighbours.length());
437 ColPartition_CLIST non_candidate_neighbours;
438 non_candidate_neighbours.set_subtract(SortByBoxLeft<ColPartition>,
true,
439 &neighbours, candidates);
440 int worst_nc_increase = 0;
441 int best_increase = INT32_MAX;
443 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
445 if (confirm_cb !=
nullptr && !confirm_cb(part, candidate)) {
447 tprintf(
"Candidate not confirmed:");
452 int increase = IncreaseInOverlap(part, candidate, ok_overlap, &neighbours);
454 if (best_candidate ==
nullptr || increase < best_increase) {
455 best_candidate = candidate;
456 best_increase = increase;
459 tprintf(
"New best merge candidate has increase %d, area %d, over box:",
460 increase, best_area);
464 }
else if (increase == best_increase) {
466 if (area < best_area) {
468 best_candidate = candidate;
471 increase = IncreaseInOverlap(part, candidate, ok_overlap,
472 &non_candidate_neighbours);
473 if (increase > worst_nc_increase)
474 worst_nc_increase = increase;
476 if (best_increase > 0) {
483 if (worst_nc_increase < best_increase &&
484 TestCompatibleCandidates(*part, debug, candidates)) {
485 best_increase = worst_nc_increase;
488 if (overlap_increase !=
nullptr)
489 *overlap_increase = best_increase;
490 return best_candidate;
496 ColPartition_LIST* part_list) {
509 ColPartition_LIST* big_parts) {
522 int unresolved_overlaps = 0;
526 if (neighbour == part)
540 if (!shrunken.
overlap(neighbour_box) &&
545 RemoveBadBox(excluded, part, big_parts);
550 }
else if (box.
contains(neighbour_box)) {
551 ++unresolved_overlaps;
562 RemoveBadBox(excluded, neighbour, big_parts);
571 if (neighbour_overlap_count <= part_overlap_count ||
575 if (split_blob !=
nullptr) {
584 if (split_blob !=
nullptr) {
591 if (right_part !=
nullptr) {
598 if (unresolved_overlaps > 2 && part->
IsSingleton()) {
601 ColPartition_IT big_it(big_parts);
603 big_it.add_to_end(part);
626 bool any_changed =
false;
632 if (SmoothRegionType(nontext_map, im_box, rotation, debug, part))
641 ColPartition_LIST parts;
642 ColPartition_IT part_it(&parts);
648 part_it.add_after_then_move(part);
655 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
656 part = part_it.extract();
669 TO_BLOCK_LIST* to_blocks) {
670 TO_BLOCK_IT to_block_it(to_blocks);
671 BLOCK_IT block_it(blocks);
673 ColPartition_LIST parts;
674 ColPartition_IT part_it(&parts);
680 part_it.add_after_then_move(part);
693 if (row ==
nullptr) {
701 auto* to_block =
new TO_BLOCK(block);
702 TO_ROW_IT row_it(to_block->get_rows());
703 row_it.add_after_then_move(row);
707 to_block->line_size = static_cast<float>(median_width);
708 to_block->line_spacing = static_cast<float>(box.
width());
709 to_block->max_blob_size = static_cast<float>(box.
width() + 1);
711 to_block->line_size = static_cast<float>(median_height);
712 to_block->line_spacing = static_cast<float>(box.
height());
713 to_block->max_blob_size = static_cast<float>(box.
height() + 1);
715 if (to_block->line_size == 0) to_block->line_size = 1;
716 block_it.add_to_end(block);
717 to_block_it.add_to_end(to_block);
730 ColPartition_LIST parts;
731 ColPartition_IT part_it(&parts);
737 part_it.add_after_then_move(part);
745 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
746 part = part_it.extract();
762 if (left_line !=
nullptr && !left_line->
IsLeftTab())
764 if (left_line !=
nullptr && left_line->
IsLeftTab())
767 if (right_line !=
nullptr && !right_line->
IsRightTab())
769 if (right_line !=
nullptr && right_line->
IsRightTab())
778 auto* part_lists =
new ColPartition_LIST[
gridheight()];
785 bool any_parts_found =
false;
793 ColPartition_IT part_it(&part_lists[grid_y]);
794 part_it.add_to_end(part);
795 any_parts_found =
true;
798 if (any_parts_found) {
799 for (
int grid_y = 0; grid_y <
gridheight(); ++grid_y) {
801 if (!part_lists[grid_y].empty()) {
807 delete [] part_lists;
808 return any_parts_found;
832 if (single_column_part ==
nullptr) {
836 single_column_part->
CopyLeftTab(*single_column_part,
false);
837 single_column_part->
CopyRightTab(*single_column_part,
false);
847 if (single_column_part !=
nullptr) {
871 BLOBNBOX_IT im_blob_it(im_blobs);
872 ColPartition_LIST dead_parts;
873 ColPartition_IT dead_part_it(&dead_parts);
881 bool any_blobs_moved =
false;
883 BLOBNBOX_C_IT blob_it(part->
boxes());
884 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
886 im_blob_it.add_after_then_move(blob);
890 BLOBNBOX_C_IT blob_it(part->
boxes());
891 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
899 any_blobs_moved =
true;
908 BLOBNBOX_C_IT blob_it(part->
boxes());
910 dead_part_it.add_to_end(part);
912 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
916 delete blob->
cblob();
920 }
else if (any_blobs_moved) {
935 ColPartition_LIST saved_parts;
936 ColPartition_IT part_it(&saved_parts);
942 part_it.add_to_end(part);
947 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
948 part = part_it.extract();
968 ? best_columns[gsearch.
GridY()]
970 FindPartitionMargins(columns, part);
973 tprintf(
"Computed margins for part:");
985 ColPartition_LIST* parts) {
986 ColPartition_IT part_it(parts);
987 for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
990 if (best_columns !=
nullptr) {
995 columns = best_columns[grid_y];
997 FindPartitionMargins(columns, part);
1003 ColPartition_LIST dead_parts;
1004 ColPartition_IT dead_it(&dead_parts);
1010 dead_it.add_to_end(part);
1069 for (
int upper = 0; upper < 2; ++upper) {
1073 for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
1074 partner_it.forward()) {
1080 if (!partner_it.cycled_list())
continue;
1082 for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
1083 partner_it.forward()) {
1088 tprintf(
"Finding figure captions for image part:");
1090 tprintf(
"Considering partner:");
1091 partner_box.
print();
1093 if (partner_box.
left() >= part_box.
left() &&
1095 int dist = partner_box.
y_gap(part_box);
1096 if (best_caption ==
nullptr || dist < best_dist) {
1098 best_caption = partner;
1104 if (best_caption !=
nullptr) {
1106 tprintf(
"Best caption candidate:");
1107 best_caption->bounding_box().print();
1113 int biggest_gap = 0;
1114 int smallest_gap = INT16_MAX;
1115 int total_height = 0;
1116 int mean_height = 0;
1119 for (
ColPartition* partner = best_caption; partner !=
nullptr &&
1121 partner = next_partner) {
1122 if (!partner->IsTextType()) {
1123 end_partner = partner;
1129 if (next_partner !=
nullptr) {
1132 if (gap > biggest_gap) {
1134 end_partner = next_partner;
1135 mean_height = total_height / line_count;
1136 }
else if (gap < smallest_gap) {
1147 tprintf(
"Line count=%d, biggest gap %d, smallest%d, mean height %d\n",
1148 line_count, biggest_gap, smallest_gap, mean_height);
1149 if (end_partner !=
nullptr) {
1155 end_partner =
nullptr;
1158 for (
ColPartition* partner = best_caption; partner !=
nullptr &&
1159 partner != end_partner;
1160 partner = next_partner) {
1162 partner->SetBlobTypes();
1164 tprintf(
"Set caption type for partition:");
1165 partner->bounding_box().print();
1202 int height = top - bottom;
1203 int mid_y = (bottom + top) / 2;
1209 int best_dist = INT32_MAX;
1211 if (neighbour == part || neighbour->
type() ==
PT_NOISE)
1215 int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
1216 if (upper != (neighbour_y > mid_y))
1221 if (best_neighbour ==
nullptr)
1222 best_neighbour = neighbour;
1225 int dist = upper ? neighbour_bottom - top : bottom - neighbour_top;
1227 if (dist < best_dist) {
1229 best_neighbour = neighbour;
1235 if (best_neighbour !=
nullptr)
1248 int width = right >= left ? right - left : -1;
1249 int mid_x = (left + right) / 2;
1255 int best_dist = INT32_MAX;
1256 while ((neighbour = hsearch.
NextSideSearch(to_the_left)) !=
nullptr) {
1257 if (neighbour == part || neighbour->
type() ==
PT_NOISE)
1261 int neighbour_x = (neighbour_left + neighbour_right) / 2;
1262 if (to_the_left != (neighbour_x < mid_x))
1268 int dist = to_the_left ? left - neighbour_right : neighbour_left - right;
1270 if (dist < best_dist || best_neighbour ==
nullptr) {
1272 best_neighbour = neighbour;
1280 if (best_neighbour !=
nullptr)
1281 part->
AddPartner(to_the_left, best_neighbour);
1297 get_desperate,
this);
1310 void ColPartitionGrid::FindMergeCandidates(
const ColPartition* part,
1311 const TBOX& search_box,
bool debug,
1312 ColPartition_CLIST* candidates) {
1318 rsearch.SetUniqueMode(
true);
1319 rsearch.StartRectSearch(search_box);
1321 while ((candidate = rsearch.NextRectSearch()) !=
nullptr) {
1322 if (!OKMergeCandidate(part, candidate, debug))
1339 TBOX merged_box(part_box);
1340 merged_box += c_box;
1342 msearch.SetUniqueMode(
true);
1343 msearch.StartRectSearch(merged_box);
1345 while ((neighbour = msearch.NextRectSearch()) !=
nullptr) {
1346 if (neighbour == part || neighbour == candidate)
1348 if (neighbour->
OKMergeOverlap(*part, *candidate, ok_overlap,
false))
1355 !OKMergeCandidate(part, neighbour,
false) &&
1356 !OKMergeCandidate(candidate, neighbour,
false))
1359 if (neighbour !=
nullptr) {
1361 tprintf(
"Combined box overlaps another that is not OK despite"
1362 " allowance of %d:", ok_overlap);
1365 OKMergeCandidate(part, neighbour,
true);
1367 OKMergeCandidate(candidate, neighbour,
true);
1379 candidates->add_sorted(SortByBoxLeft<ColPartition>,
true, candidate);
1394 bool ColPartitionGrid::SmoothRegionType(Pix* nontext_map,
1396 const FCOORD& rerotation,
1398 ColPartition* part) {
1399 const TBOX& part_box = part->bounding_box();
1401 tprintf(
"Smooothing part at:");
1405 int best_dist = INT32_MAX;
1406 int max_dist = std::min(part_box.
width(), part_box.
height());
1409 bool any_image =
false;
1410 bool all_image =
true;
1413 auto dir = static_cast<BlobNeighbourDir>(d);
1415 rerotation, debug, *part,
1418 tprintf(
"Result in dir %d = %d at dist %d\n", dir,
type, dist);
1429 if (best_dist > max_dist)
1436 if (best_type ==
BRT_TEXT && !any_image) {
1446 if (new_type != part->blob_type() || new_flow != part->flow()) {
1447 part->set_flow(new_flow);
1448 part->set_blob_type(new_type);
1449 part->SetBlobTypes();
1464 const TBOX& part_box,
1468 *search_box = part_box;
1471 int padding = std::min(part_box.
height(), part_box.
width());
1472 padding = std::max(padding, min_padding);
1474 search_box->
pad(padding, padding);
1477 switch (direction) {
1480 *dist_scaling =
ICOORD(2, 1);
1484 *dist_scaling =
ICOORD(1, 2);
1488 *dist_scaling =
ICOORD(2, 1);
1492 *dist_scaling =
ICOORD(1, 2);
1520 const TBOX& im_box,
const FCOORD& rerotation,
1521 bool debug,
const ColPartition& part,
int* best_distance) {
1523 const TBOX& part_box = part.bounding_box();
1526 ComputeSearchBoxAndScaling(direction, part_box,
gridsize(),
1527 &search_box, &dist_scaling);
1532 AccumulatePartDistances(part, dist_scaling, search_box,
1533 nontext_map, im_box, rerotation, debug, dists);
1538 memset(counts, 0,
sizeof(counts[0]) *
NPT_COUNT);
1546 min_dist = INT32_MAX;
1548 if (counts[i] < dists[i].size() && dists[i][counts[i]] < min_dist)
1549 min_dist = dists[i][counts[i]];
1553 while (counts[i] < dists[i].size() && dists[i][counts[i]] <= min_dist)
1556 *best_distance = min_dist;
1558 tprintf(
"Totals: htext=%d+%d, vtext=%d+%d, image=%d+%d, at dist=%d\n",
1561 counts[
NPT_IMAGE], image_bias, min_dist);
1569 if (image_count > 0 &&
1590 }
while (min_dist < INT32_MAX);
1601 void ColPartitionGrid::AccumulatePartDistances(
const ColPartition& base_part,
1602 const ICOORD& dist_scaling,
1603 const TBOX& search_box,
1606 const FCOORD& rerotation,
1609 const TBOX& part_box = base_part.bounding_box();
1611 rsearch.SetUniqueMode(
true);
1612 rsearch.StartRectSearch(search_box);
1613 ColPartition* neighbour;
1616 while ((neighbour = rsearch.NextRectSearch()) !=
nullptr) {
1617 if (neighbour->IsUnMergeableType() ||
1618 !base_part.ConfirmNoTabViolation(*neighbour) ||
1619 neighbour == &base_part)
1621 TBOX nbox = neighbour->bounding_box();
1629 int x_gap = std::max(part_box.
x_gap(nbox), 0);
1630 int y_gap = std::max(part_box.
y_gap(nbox), 0);
1631 int n_dist = x_gap * dist_scaling.
x() + y_gap* dist_scaling.
y();
1633 tprintf(
"Part has x-gap=%d, y=%d, dist=%d at:",
1634 x_gap, y_gap, n_dist);
1656 if (debug)
tprintf(
"Weak %d\n", n_boxes);
1659 if (debug)
tprintf(
"Image %d\n", n_boxes);
1661 if (count_vector !=
nullptr) {
1662 for (
int i = 0; i < n_boxes; ++i)
1677 void ColPartitionGrid::FindPartitionMargins(ColPartitionSet* columns,
1678 ColPartition* part) {
1681 int y = part->MidY();
1683 int left_margin =
bleft().
x();
1684 int right_margin =
tright().
x();
1685 if (columns !=
nullptr) {
1686 ColPartition* column = columns->ColumnContaining(box.
left(), y);
1687 if (column !=
nullptr)
1688 left_margin = column->LeftAtY(y);
1689 column = columns->ColumnContaining(box.
right(), y);
1690 if (column !=
nullptr)
1691 right_margin = column->RightAtY(y);
1696 left_margin = FindMargin(box.
left() + box.
height(),
true, left_margin,
1698 part->set_left_margin(left_margin);
1700 right_margin = FindMargin(box.
right() - box.
height(),
false, right_margin,
1702 part->set_right_margin(right_margin);
1708 int ColPartitionGrid::FindMargin(
int x,
bool right_to_left,
int x_limit,
1709 int y_bottom,
int y_top,
1710 const ColPartition* not_this) {
1711 int height = y_top - y_bottom;
1714 side_search.SetUniqueMode(
true);
1715 side_search.StartSideSearch(x, y_bottom, y_top);
1717 while ((part = side_search.NextSideSearch(right_to_left)) !=
nullptr) {
1719 if (part == not_this)
1723 TBOX box = part->bounding_box();
1724 int min_overlap = std::min(height, static_cast<int>(box.
height()));
1726 int y_overlap = std::min(y_top, static_cast<int>(box.
top())) - std::max(y_bottom, static_cast<int>(box.
bottom()));
1727 if (y_overlap < min_overlap)
1730 int x_edge = right_to_left ? box.
right() : box.
left();
1731 if ((x_edge < x) != right_to_left)
1734 if ((x_edge < x_limit) == right_to_left)