22 #include "config_auto.h"
// File-local parameters declared through the BOOL_VAR / INT_VAR macros; each
// takes a name, a default value and a description string. The show_* flags
// are tested by the display code in this file and all default to off.
56 static BOOL_VAR(textord_tabfind_show_initial_partitions,
57 false,
"Show partition bounds");
58 static BOOL_VAR(textord_tabfind_show_reject_blobs,
59 false,
"Show blobs rejected as noise");
// Integer rather than boolean: the code also tests for values greater than 1.
60 static INT_VAR(textord_tabfind_show_partitions, 0,
61 "Show partition bounds, waiting if >1");
62 static BOOL_VAR(textord_tabfind_show_columns,
false,
"Show column bounds");
63 static BOOL_VAR(textord_tabfind_show_blocks,
false,
"Show final block bounds");
// The only flag in this group that defaults to true.
64 static BOOL_VAR(textord_tabfind_find_tables,
true,
"run table detection");
// Out-of-class definition of the static member blocks_win_. It starts out
// null; DisplayBlocks tests it against nullptr before using it.
66 ScrollView* ColumnFinder::blocks_win_ =
nullptr;
// NOTE(review): the opening of this definition is not visible here. The
// TabFind(...) base initializer followed by a member-initializer list suggests
// this is the ColumnFinder constructor - confirm.
76 int resolution,
bool cjk_script,
77 double aligned_gap_fraction,
78 TabVector_LIST* vlines, TabVector_LIST* hlines,
79 int vertical_x,
int vertical_y)
80 :
TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y,
82 cjk_script_(cjk_script),
// Initial column-gap value: the x distance between bleft and tright.
84 mean_column_gap_(tright.x() - bleft.x()),
85 tabfind_aligned_gap_fraction_(aligned_gap_fraction),
// reskew_, rotation_ and rerotate_ start as (1, 0); text_rotation_ as (0, 0).
87 reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
88 text_rotation_(0.0f, 0.0f),
// All pointer members start out null.
89 best_columns_(nullptr), stroke_width_(nullptr),
90 part_grid_(gridsize, bleft, tright), nontext_map_(nullptr),
91 projection_(resolution),
92 denorm_(nullptr), input_blobs_win_(nullptr), equation_detect_(nullptr) {
// Adds the caller's hlines list to horizontal_lines_ through a list iterator.
93 TabVector_IT h_it(&horizontal_lines_);
94 h_it.add_list_after(hlines);
// NOTE(review): the opening of this definition is not visible here. The run of
// deletes on member pointers suggests the ColumnFinder destructor - confirm.
// best_columns_ is an array (delete[]); the other members are single objects.
99 delete [] best_columns_;
100 delete stroke_width_;
101 delete input_blobs_win_;
102 pixDestroy(&nontext_map_);
// Walk the denorm_ chain through predecessor(). The current node is saved in
// dead_denorm before denorm_ is advanced; the const_cast removes constness
// from the predecessor() result so it can be stored in denorm_.
103 while (denorm_ !=
nullptr) {
104 DENORM* dead_denorm = denorm_;
105 denorm_ = const_cast<DENORM*>(denorm_->
predecessor());
// One iterator is used for noise_parts_ and then re-pointed at good_parts_.
111 ColPartition_IT part_it(&noise_parts_);
112 for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
119 part_it.set_to_list(&good_parts_);
120 for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
// Visit image_bblobs_; the visible part of the loop body deletes the blob's
// cblob().
128 BLOBNBOX_IT bb_it(&image_bblobs_);
129 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
131 delete bblob->
cblob();
// NOTE(review): fragment of a definition whose signature is not visible here.
// Any existing stroke_width_ object is deleted first.
146 delete stroke_width_;
// Graphics builds only: when textord_tabfind_show_blocks is set, a window
// titled "Filtered Input Blobs" is created and stored in input_blobs_win_.
150 #ifndef GRAPHICS_DISABLED
151 if (textord_tabfind_show_blocks) {
152 input_blobs_win_ =
MakeWindow(0, 0,
"Filtered Input Blobs");
155 #endif // GRAPHICS_DISABLED
// Any previously held nontext_map_ is released through pixDestroy.
157 pixDestroy(&nontext_map_);
163 photo_mask_pix, input_block);
167 stroke_width_->
Clear();
// NOTE(review): tail of a function signature; neither its start nor its body
// is visible here. The last parameter is a list of blobs named osd_blobs.
181 BLOBNBOX_CLIST* osd_blobs) {
// NOTE(review): the start of this signature is not visible here. The visible
// body derives rotation_, text_rotation_ and rerotate_ from the two parameters
// below.
198 bool vertical_text_lines,
199 int recognition_rotation) {
// Direction vectors used as rotation values below.
200 const FCOORD anticlockwise90(0.0f, 1.0f);
201 const FCOORD clockwise90(0.0f, -1.0f);
202 const FCOORD rotation180(-1.0f, 0.0f);
203 const FCOORD norotation(1.0f, 0.0f);
205 text_rotation_ = norotation;
// rotation_ defaults to norotation; recognition_rotation values 1, 2 and 3
// select anticlockwise90, rotation180 and clockwise90 respectively.
208 rotation_ = norotation;
209 if (recognition_rotation == 1) {
210 rotation_ = anticlockwise90;
211 }
else if (recognition_rotation == 2) {
212 rotation_ = rotation180;
213 }
else if (recognition_rotation == 3) {
214 rotation_ = clockwise90;
// An odd recognition_rotation flips the vertical_text_lines flag.
220 if (recognition_rotation & 1) {
221 vertical_text_lines = !vertical_text_lines;
// With vertical text lines, rotation_ is further rotated by anticlockwise90
// and text_rotation_ by clockwise90.
227 if (vertical_text_lines) {
228 rotation_.
rotate(anticlockwise90);
229 text_rotation_.
rotate(clockwise90);
// rerotate_ is rotation_ with the sign of its y component flipped.
232 rerotate_ =
FCOORD(rotation_.
x(), -rotation_.
y());
// The guarded code runs only when rotation_ differs from (1, 0).
233 if (rotation_.
x() != 1.0f || rotation_.
y() != 0.0f) {
// Diagnostic print of the inputs and both resulting rotation vectors.
249 tprintf(
"Vertical=%d, orientation=%d, final rotation=(%f, %f)+(%f,%f)\n",
250 vertical_text_lines, recognition_rotation,
251 rotation_.
x(), rotation_.
y(),
252 text_rotation_.
x(), text_rotation_.
y());
258 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
// NOTE(review): the start of this signature is not visible here. The visible
// body runs the partition clean-up passes, converts the grid to blocks
// (TransformToBlocks) and rotates/reskews them (RotateAndReskewBlocks).
286 int scaled_factor,
TO_BLOCK* input_block,
287 Pix* photo_mask_pix, Pix* thresholds_pix,
289 BLOCK_LIST* blocks, BLOBNBOX_LIST* diacritic_blobs,
290 TO_BLOCK_LIST* to_blocks) {
// ORs nontext_map_ into photo_mask_pix; destination and first source are the
// same pix.
291 pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
298 pageseg_mode, rerotate_, input_block, nontext_map_, denorm_, cjk_script_,
299 &projection_, diacritic_blobs, &part_grid_, &big_parts_);
302 input_block,
this, pixa_debug, &part_grid_,
307 input_block,
this, pixa_debug, &part_grid_,
// DisownBoxesNoAssert() is called on every partition in big_parts_.
314 ColPartition_IT p_it(&big_parts_);
315 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward())
316 p_it.data()->DisownBoxesNoAssert();
// stroke_width_ is deleted and reset to null.
318 delete stroke_width_;
319 stroke_width_ =
nullptr;
345 ReflectForRtl(input_block, &image_bblobs_);
358 min_gutter_width_, tabfind_aligned_gap_fraction_,
359 &part_grid_, &deskew_, &reskew_);
// A newly allocated DENORM becomes the current denorm_.
361 auto* new_denorm =
new DENORM;
363 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
364 denorm_ = new_denorm;
// Column finding is attempted with single_column == false; the guarded code
// handles a false return.
370 if (!MakeColumns(
false)) {
379 #ifndef GRAPHICS_DISABLED
380 if (textord_tabfind_show_reject_blobs) {
384 #endif // GRAPHICS_DISABLED
// Passes over part_grid_, in this order: split, merge, re-insert leftover
// noise, then insert horizontal and vertical line partitions.
390 GridSplitPartitions();
394 GridMergePartitions();
397 InsertRemainingNoise(input_block);
399 GridInsertHLinePartitions();
400 GridInsertVLinePartitions();
405 if (textord_tabfind_show_initial_partitions) {
// The next two steps are optional: one needs a non-null equation_detect_, the
// other the textord_tabfind_find_tables flag.
412 if (equation_detect_) {
415 if (textord_tabfind_find_tables) {
427 GridRemoveUnderlinePartitions();
// Graphics builds only: partition display, with extra handling when the flag
// value is greater than 1.
437 #ifndef GRAPHICS_DISABLED
438 if (textord_tabfind_show_partitions) {
440 if (window !=
nullptr) {
444 if (window !=
nullptr && textord_tabfind_show_partitions > 1) {
449 #endif // GRAPHICS_DISABLED
455 ReleaseBlobsAndCleanupUnused(input_block);
463 TransformToBlocks(blocks, to_blocks);
465 tprintf(
"Found %d blocks, %d to_blocks\n",
466 blocks->length(), to_blocks->length());
469 DisplayBlocks(blocks);
470 RotateAndReskewBlocks(input_is_rtl, to_blocks);
// Graphics builds only: if blocks_win_ exists, window events are inspected
// (an input event whose parameter starts with 'd' is special-cased) and
// blocks_win_ is reset to null.
472 #ifndef GRAPHICS_DISABLED
473 if (blocks_win_ !=
nullptr) {
474 bool waiting =
false;
478 if (event->type ==
SVET_INPUT && event->parameter !=
nullptr) {
479 if (*event->parameter ==
'd')
484 blocks_win_ =
nullptr;
491 #endif // GRAPHICS_DISABLED
// NOTE(review): fragment of a definition whose signature is not visible here.
// Negates the y component of *deskew in place.
499 deskew->
set_y(-deskew->
y());
// NOTE(review): fragment of a definition whose signature is not visible here.
// Stores the supplied detect value in equation_detect_.
503 equation_detect_ = detect;
// Debug display of the given block list in blocks_win_.
// The body is compiled only when GRAPHICS_DISABLED is not defined and acts
// only when textord_tabfind_show_blocks is set.
509 void ColumnFinder::DisplayBlocks(BLOCK_LIST* blocks) {
510 #ifndef GRAPHICS_DISABLED
511 if (textord_tabfind_show_blocks) {
// blocks_win_ is checked for null, then cleared before the blocks are visited.
512 if (blocks_win_ ==
nullptr)
515 blocks_win_->
Clear();
517 BLOCK_IT block_it(blocks);
519 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
520 block_it.forward()) {
521 BLOCK* block = block_it.data();
// Debug display driven by best_columns_ (graphics builds only).
// Only non-null entries of best_columns_ are acted on.
533 void ColumnFinder::DisplayColumnBounds(
PartSetVector* sets) {
534 #ifndef GRAPHICS_DISABLED
539 ColPartitionSet* columns = best_columns_[i];
540 if (columns !=
nullptr)
// Builds the column candidates in column_sets_, assigns a column set to each
// entry of part_sets (AssignColumns) and updates the mean column gap
// (ComputeMeanColumnGap).
// single_column: when false, candidates are collected from the partition sets
// and refined through ImproveColumnCandidates.
548 bool ColumnFinder::MakeColumns(
bool single_column) {
553 if (!single_column) {
// Candidate collection runs at most twice: first with good_only == true and,
// if column_sets_ is still empty, once more with good_only == false (see the
// toggle and the loop condition below).
558 bool good_only =
true;
561 ColPartitionSet* line_set = part_sets.get(i);
562 if (line_set !=
nullptr && line_set->LegalColumnCandidate()) {
// A copy of the line set is offered to column_sets_; Copy() may return null.
563 ColPartitionSet* column_candidate = line_set->Copy(good_only);
564 if (column_candidate !=
nullptr)
565 column_candidate->AddToColumnSetsIfUnique(&column_sets_,
WidthCB());
568 good_only = !good_only;
569 }
while (column_sets_.
empty() && !good_only);
571 PrintColumnCandidates(
"Column candidates");
// First refinement: column_sets_ is both source and destination.
573 ImproveColumnCandidates(&column_sets_, &column_sets_);
575 PrintColumnCandidates(
"Improved columns");
// Second refinement: part_sets is the source.
577 ImproveColumnCandidates(&part_sets, &column_sets_);
579 ColPartitionSet* single_column_set =
581 if (single_column_set !=
nullptr) {
587 PrintColumnCandidates(
"Final Columns");
588 bool has_columns = !column_sets_.
empty();
591 bool any_multi_column = AssignColumns(part_sets);
592 if (textord_tabfind_show_columns) {
593 DisplayColumnBounds(&part_sets);
595 ComputeMeanColumnGap(any_multi_column);
// RelinquishParts() is called on every non-null line set.
597 for (
int i = 0; i < part_sets.size(); ++i) {
598 ColPartitionSet* line_set = part_sets.get(i);
599 if (line_set !=
nullptr) {
600 line_set->RelinquishParts();
// Rebuilds column_sets from improved copies of its current contents.
// src_sets is passed to ImproveColumnCandidate for each copy; it may be the
// same vector as column_sets.
611 void ColumnFinder::ImproveColumnCandidates(
PartSetVector* src_sets,
// The current candidates are moved into temp_cols so column_sets can be
// refilled. If src_sets aliases column_sets, it is redirected to temp_cols,
// which now holds those candidates.
614 temp_cols.
move(column_sets);
615 if (src_sets == column_sets)
616 src_sets = &temp_cols;
617 int set_size = temp_cols.size();
// At most two passes: good_only == true first, then good_only == false if
// column_sets is still empty (see the toggle and the loop condition below).
619 bool good_only =
true;
621 for (
int i = 0; i < set_size; ++i) {
622 ColPartitionSet* column_candidate = temp_cols.get(i);
624 ColPartitionSet* improved = column_candidate->Copy(good_only);
625 if (improved !=
nullptr) {
626 improved->ImproveColumnCandidate(
WidthCB(), src_sets);
627 improved->AddToColumnSetsIfUnique(column_sets,
WidthCB());
630 good_only = !good_only;
631 }
while (column_sets->empty() && !good_only);
// If nothing was added, the original candidates are moved back.
632 if (column_sets->empty())
633 column_sets->move(&temp_cols);
635 temp_cols.delete_data_pointers();
// Prints the number of entries in column_sets_ together with the given title,
// then visits each column set in turn.
639 void ColumnFinder::PrintColumnCandidates(
const char* title) {
640 int set_size = column_sets_.
size();
641 tprintf(
"Found %d %s:\n", set_size, title);
643 for (
int i = 0; i < set_size; ++i) {
644 ColPartitionSet* column_set = column_sets_.
get(i);
// Chooses an entry of column_sets_ for each index of part_sets and records it
// in best_columns_.
// Returns true if any column set assigned in the main loop has
// GoodColumnCount() > 1.
659 bool ColumnFinder::AssignColumns(
const PartSetVector& part_sets) {
660 int set_count = part_sets.size();
// best_columns_ gets one slot per part set, all initially null.
663 best_columns_ =
new ColPartitionSet*[set_count];
664 for (
int y = 0; y < set_count; ++y)
665 best_columns_[y] =
nullptr;
666 int column_count = column_sets_.
size();
// Scratch arrays indexed by part-set index; all are freed before returning.
676 bool* any_columns_possible =
new bool[set_count];
677 int* assigned_costs =
new int[set_count];
678 int** column_set_costs =
new int*[set_count];
// Fill the cost table: a compatible column set costs UnmatchedWidth(line_set),
// an incompatible one INT32_MAX. any_columns_possible records whether any
// column set was compatible with the row.
681 for (
int part_i = 0; part_i < set_count; ++part_i) {
682 ColPartitionSet* line_set = part_sets.get(part_i);
683 bool debug = line_set !=
nullptr &&
685 line_set->bounding_box().bottom());
686 column_set_costs[part_i] =
new int[column_count];
687 any_columns_possible[part_i] =
false;
688 assigned_costs[part_i] = INT32_MAX;
689 for (
int col_i = 0; col_i < column_count; ++col_i) {
690 if (line_set !=
nullptr &&
691 column_sets_.
get(col_i)->CompatibleColumns(debug, line_set,
693 column_set_costs[part_i][col_i] =
694 column_sets_.
get(col_i)->UnmatchedWidth(line_set);
695 any_columns_possible[part_i] =
true;
697 column_set_costs[part_i][col_i] = INT32_MAX;
699 tprintf(
"Set id %d did not match at y=%d, lineset =%p\n",
700 col_i, part_i, line_set);
704 bool any_multi_column =
false;
// Main loop: take the biggest unassigned range, pick its modal column set,
// shrink the range to the longest run for that set, extend it past small gaps
// in both directions, then assign the set to the range.
708 while (BiggestUnassignedRange(set_count, any_columns_possible,
711 tprintf(
"Biggest unassigned range = %d- %d\n", start, end);
713 int column_set_id = RangeModalColumnSet(column_set_costs,
714 assigned_costs, start, end);
716 tprintf(
"Range modal column id = %d\n", column_set_id);
717 column_sets_.
get(column_set_id)->Print();
720 ShrinkRangeToLongestRun(column_set_costs, assigned_costs,
721 any_columns_possible,
722 column_set_id, &start, &end);
724 tprintf(
"Shrunk range = %d- %d\n", start, end);
// Downward extension: step -1, limit -1, updates start.
728 ExtendRangePastSmallGaps(column_set_costs, assigned_costs,
729 any_columns_possible,
730 column_set_id, -1, -1, &start);
// Upward extension: step +1, limit set_count, updates end.
732 ExtendRangePastSmallGaps(column_set_costs, assigned_costs,
733 any_columns_possible,
734 column_set_id, 1, set_count, &end);
737 tprintf(
"Column id %d applies to range = %d - %d\n",
738 column_set_id, start, end);
740 AssignColumnToRange(column_set_id, start, end, column_set_costs,
742 if (column_sets_.
get(column_set_id)->GoodColumnCount() > 1)
743 any_multi_column =
true;
// Fallback: if index 0 is still unassigned, column set 0 is applied to the
// range [0, gridheight_).
747 if (best_columns_[0] ==
nullptr) {
748 AssignColumnToRange(0, 0,
gridheight_, column_set_costs, assigned_costs);
// Free the per-row cost arrays, then the three scratch arrays.
751 for (
int i = 0; i < set_count; ++i) {
752 delete [] column_set_costs[i];
754 delete [] assigned_costs;
755 delete [] any_columns_possible;
756 delete [] column_set_costs;
757 return any_multi_column;
// Searches best_columns_ for the largest range of still-unassigned indices.
// The result is written to *best_start / *best_end; both are preset to
// set_count, which is an empty range.
// Returns true when the reported range is non-empty (*best_start < *best_end).
762 bool ColumnFinder::BiggestUnassignedRange(
int set_count,
763 const bool* any_columns_possible,
764 int* best_start,
int* best_end) {
765 int best_range_size = 0;
766 *best_start = set_count;
767 *best_end = set_count;
// NOTE(review): the outer loop is bounded by gridheight_ while the inner scans
// are bounded by set_count - confirm the two are always equal.
769 for (
int start = 0; start <
gridheight_; start = end) {
// Range start: an unassigned index for which some column set is possible.
771 while (start < set_count) {
772 if (best_columns_[start] ==
nullptr && any_columns_possible[start])
// Range end: scan from the start, testing for an already-assigned index and
// for indices where a column set is possible.
779 while (end < set_count) {
780 if (best_columns_[end] !=
nullptr)
782 if (any_columns_possible[end])
786 if (start < set_count && range_size > best_range_size) {
787 best_range_size = range_size;
792 return *best_start < *best_end;
// Returns the mode of a histogram over column set indices. Within
// [start, end), an index is counted once for every row where its cost is
// below that row's currently assigned cost.
796 int ColumnFinder::RangeModalColumnSet(
int** column_set_costs,
797 const int* assigned_costs,
798 int start,
int end) {
799 int column_count = column_sets_.
size();
800 STATS column_stats(0, column_count);
801 for (
int part_i = start; part_i < end; ++part_i) {
802 for (
int col_j = 0; col_j < column_count; ++col_j) {
803 if (column_set_costs[part_i][col_j] < assigned_costs[part_i])
804 column_stats.add(col_j, 1);
808 return column_stats.mode();
// Scans the input range [*best_start, *best_end) for column_set_id and tracks
// the largest sub-range size found (end - start).
// The outputs are first reset to the empty range at the original end.
815 void ColumnFinder::ShrinkRangeToLongestRun(
int** column_set_costs,
816 const int* assigned_costs,
817 const bool* any_columns_possible,
819 int* best_start,
int* best_end) {
821 int orig_start = *best_start;
822 int orig_end = *best_end;
823 int best_range_size = 0;
824 *best_start = orig_end;
825 *best_end = orig_end;
827 for (
int start = orig_start; start < orig_end; start = end) {
// Sub-range start: tests for a row that column_set_id improves on, or one
// where no column set is possible.
829 while (start < orig_end) {
830 if (column_set_costs[start][column_set_id] < assigned_costs[start] ||
831 !any_columns_possible[start])
// Sub-range end: tests for a row where column_set_id is no improvement even
// though some column set is possible.
// NOTE(review): the cost at index `end` is compared with
// assigned_costs[start], not assigned_costs[end] - confirm this is intended.
837 while (end < orig_end) {
838 if (column_set_costs[end][column_set_id] >= assigned_costs[start] &&
839 any_columns_possible[end])
843 if (start < orig_end && end - start > best_range_size) {
844 best_range_size = end - start;
// Walks from *start in direction `step` towards the exclusive limit `end`,
// alternating a "barrier" scan with a "good" scan for column_set_id.
// The outer do/while repeats while good_size >= barrier_size.
// AssignColumns calls it with step -1 / end -1 and with step +1 / end
// set_count.
854 void ColumnFinder::ExtendRangePastSmallGaps(
int** column_set_costs,
855 const int* assigned_costs,
856 const bool* any_columns_possible,
858 int step,
int end,
int* start) {
860 tprintf(
"Starting expansion at %d, step=%d, limit=%d\n",
865 int barrier_size = 0;
// Barrier scan: looks for the next row where column_set_id beats the assigned
// cost, also testing whether rows in between have any possible column set.
871 for (i = *start + step; i != end; i += step) {
872 if (column_set_costs[i][column_set_id] < assigned_costs[i])
875 if (any_columns_possible[i])
879 tprintf(
"At %d, Barrier size=%d\n", i, barrier_size);
// Good scan: continues from the row after the barrier with the same tests.
889 for (i += step; i != end; i += step) {
890 if (column_set_costs[i][column_set_id] < assigned_costs[i])
892 else if (any_columns_possible[i])
896 tprintf(
"At %d, good size = %d\n", i, good_size);
898 if (good_size >= barrier_size)
900 }
while (good_size >= barrier_size);
// Assigns the column set with index column_set_id to every row in
// [start, end): best_columns_[i] points at the set and assigned_costs[i]
// takes that set's cost for the row.
904 void ColumnFinder::AssignColumnToRange(
int column_set_id,
int start,
int end,
905 int** column_set_costs,
906 int* assigned_costs) {
907 ColPartitionSet* column_set = column_sets_.
get(column_set_id);
908 for (
int i = start; i < end; ++i) {
909 assigned_costs[i] = column_set_costs[i][column_set_id];
910 best_columns_[i] = column_set;
// Updates mean_column_gap_ from gap and width samples.
915 void ColumnFinder::ComputeMeanColumnGap(
bool any_multi_column) {
919 int width_samples = 0;
// Priority order: the average gap when any_multi_column is set and gap
// samples exist; otherwise the average width when width samples exist;
// otherwise 0. The sample-count checks also guard the divisions.
927 mean_column_gap_ = any_multi_column && gap_samples > 0
928 ? total_gap / gap_samples : width_samples > 0
929 ? total_width / width_samples : 0;
// File-local helper that processes every blob in the list.
// The loop runs until the list is empty, so each iteration is expected to
// take the current blob out of the list (that step is not visible here).
// A blob whose owner() is null has its cblob() deleted.
937 static void ReleaseAllBlobsAndDeleteUnused(BLOBNBOX_LIST* blobs) {
938 for (BLOBNBOX_IT blob_it(blobs); !blob_it.empty(); blob_it.forward()) {
940 if (blob->
owner() ==
nullptr) {
941 delete blob->
cblob();
// Applies ReleaseAllBlobsAndDeleteUnused to the four blob lists of the given
// block (blobs, small_blobs, noise_blobs, large_blobs) and to the member list
// image_bblobs_.
950 void ColumnFinder::ReleaseBlobsAndCleanupUnused(
TO_BLOCK* block) {
951 ReleaseAllBlobsAndDeleteUnused(&block->
blobs);
952 ReleaseAllBlobsAndDeleteUnused(&block->
small_blobs);
953 ReleaseAllBlobsAndDeleteUnused(&block->
noise_blobs);
954 ReleaseAllBlobsAndDeleteUnused(&block->
large_blobs);
955 ReleaseAllBlobsAndDeleteUnused(&image_bblobs_);
// Scans every partition in part_grid_ and, for those passing the checks
// below, looks for blobs in the gap between two columns; when the search
// finds none, the partition is split at the middle of the gap.
959 void ColumnFinder::GridSplitPartitions() {
961 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
962 gsearch(&part_grid_);
963 gsearch.StartFullSearch();
964 ColPartition* dont_repeat =
nullptr;
966 while ((part = gsearch.NextFullSearch()) !=
nullptr) {
// Tests the blob type against BRT_UNKNOWN and whether this is the partition
// remembered in dont_repeat.
967 if (part->blob_type() <
BRT_UNKNOWN || part == dont_repeat)
// The column set is the one assigned to the current grid row.
969 ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
973 part->ColumnRange(
resolution_, column_set, &first_col, &last_col);
// Tests whether the partition's column range is two adjacent indices.
982 if (last_col != first_col + 1)
985 int y = part->MidY();
986 TBOX margin_box = part->bounding_box();
990 tprintf(
"Considering partition for GridSplit:");
// Narrow margin_box to the gap: 2 units right of the first column's right
// edge and 2 units left of the last column's left edge, both taken at y.
993 ColPartition* column = column_set->GetColumnByIndex(first_col);
994 if (column ==
nullptr)
996 margin_box.
set_left(column->RightAtY(y) + 2);
997 column = column_set->GetColumnByIndex(last_col);
998 if (column ==
nullptr)
1000 margin_box.
set_right(column->LeftAtY(y) - 2);
// Search this object's blob grid inside margin_box.
1004 GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(
this);
1006 tprintf(
"Searching box (%d,%d)->(%d,%d)\n",
1008 margin_box.
right(), margin_box.
top());
1011 rectsearch.StartRectSearch(margin_box);
1013 while ((bbox = rectsearch.NextRectSearch()) !=
nullptr) {
// bbox == nullptr means the search ran to completion. The partition is then
// removed from the grid and split at the horizontal middle of margin_box.
1017 if (bbox ==
nullptr) {
1019 gsearch.RemoveBBox();
1020 int x_middle = (margin_box.
left() + margin_box.
right()) / 2;
1022 tprintf(
"Splitting part at %d:", x_middle);
1025 ColPartition* split_part = part->SplitAt(x_middle);
1026 if (split_part !=
nullptr) {
1030 split_part->Print();
1032 part_grid_.
InsertBBox(
true,
true, split_part);
1036 tprintf(
"Split had no effect\n");
// The grid was modified, so the full-search iterator is repositioned.
1040 gsearch.RepositionIterator();
1042 tprintf(
"Part cannot be split: blob (%d,%d)->(%d,%d) in column gap\n",
// Scans every partition in part_grid_ and absorbs neighbouring partitions
// that lie in the same column and pass the overlap, type, margin and
// horizontal-gap tests below.
1051 void ColumnFinder::GridMergePartitions() {
1053 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1054 gsearch(&part_grid_);
1055 gsearch.StartFullSearch();
1057 while ((part = gsearch.NextFullSearch()) !=
nullptr) {
1058 if (part->IsUnMergeableType())
1061 ColPartitionSet* columns = best_columns_[gsearch.GridY()];
1062 TBOX box = part->bounding_box();
1065 tprintf(
"Considering part for merge at:");
// Both ends of the partition must fall in the same non-null column.
1068 int y = part->MidY();
1069 ColPartition* left_column = columns->ColumnContaining(box.
left(), y);
1070 ColPartition* right_column = columns->ColumnContaining(box.
right(), y);
1071 if (left_column ==
nullptr || right_column != left_column) {
1073 tprintf(
"In different columns\n");
// Widen the search box to the full extent of that column at y.
1076 box.
set_left(left_column->LeftAtY(y));
1077 box.
set_right(right_column->RightAtY(y));
1079 bool modified_box =
false;
1080 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1081 rsearch(&part_grid_);
1082 rsearch.SetUniqueMode(
true);
1083 rsearch.StartRectSearch(box);
1084 ColPartition* neighbour;
1086 while ((neighbour = rsearch.NextRectSearch()) !=
nullptr) {
1087 if (neighbour == part || neighbour->IsUnMergeableType())
1089 const TBOX& neighbour_box = neighbour->bounding_box();
1091 tprintf(
"Considering merge with neighbour at:");
1094 if (neighbour_box.
right() < box.
left() ||
1097 if (part->VSignificantCoreOverlap(*neighbour) &&
1098 part->TypesMatch(*neighbour)) {
1104 const TBOX& part_box = part->bounding_box();
// Margin tests: each compares one box's edge with the other partition's
// margin, once for a neighbour to the right and once for one to the left.
1107 if (neighbour_box.
left() > part->right_margin() &&
1108 part_box.
right() < neighbour->left_margin())
1110 if (neighbour_box.
right() < part->left_margin() &&
1111 part_box.
left() > neighbour->right_margin())
// h_gap is the larger left edge minus the smaller right edge.
1113 int h_gap = std::max(part_box.
left(), neighbour_box.
left()) -
1114 std::min(part_box.
right(), neighbour_box.
right());
1116 part_box.
width() < mean_column_gap_ ||
1117 neighbour_box.
width() < mean_column_gap_) {
1119 tprintf(
"Running grid-based merge between:\n");
// The neighbour is removed from the grid; on the first merge for this
// partition the partition itself is removed too and the rectangle search is
// repositioned.
1123 rsearch.RemoveBBox();
1124 if (!modified_box) {
1126 gsearch.RemoveBBox();
1127 rsearch.RepositionIterator();
1128 modified_box =
true;
1130 part->Absorb(neighbour,
WidthCB());
1132 tprintf(
"Neighbour failed hgap test\n");
1135 tprintf(
"Neighbour failed overlap or typesmatch test\n");
1146 gsearch.RepositionIterator();
// For each blob visited that has no owner, searches part_grid_ for the
// closest mergeable partition and adds the blob to it when the distance is
// small enough relative to the partition's median height.
1153 void ColumnFinder::InsertRemainingNoise(
TO_BLOCK* block) {
1155 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// Blobs that already have an owner are skipped.
1157 if (blob->
owner() !=
nullptr)
continue;
1163 rsearch.SetUniqueMode(
true);
1164 rsearch.StartRectSearch(search_box);
// Track the partition with the smallest distance seen so far.
1166 ColPartition* best_part =
nullptr;
1167 int best_distance = 0;
1168 while ((part = rsearch.NextRectSearch()) !=
nullptr) {
1169 if (part->IsUnMergeableType())
1173 if (best_part ==
nullptr ||
distance < best_distance) {
// Accept only when the best distance is below kMaxDistToPartSizeRatio times
// the partition's median height.
1178 if (best_part !=
nullptr &&
1179 best_distance < kMaxDistToPartSizeRatio * best_part->median_height()) {
1182 tprintf(
"Adding noise blob with distance %d, thr=%g:box:",
// The blob joins the partition, which is then inserted into part_grid_.
1190 best_part->AddBox(blob);
1191 part_grid_.
InsertBBox(
true,
true, best_part);
1205 static TBOX BoxFromHLine(
const TabVector* hline) {
1206 int top = std::max(hline->startpt().y(), hline->endpt().y());
1207 int bottom = std::min(hline->startpt().y(), hline->endpt().y());
1208 top += hline->mean_width();
1209 if (top == bottom) {
1215 return TBOX(hline->startpt().x(), bottom, hline->endpt().x(), top);
// For each entry of horizontal_lines_, searches part_grid_ around the line's
// box and classifies what it touches (table, text, or the matching BRT_HLINE
// partition). The final test selects lines that have a line partition, touch
// text and touch no table.
1220 void ColumnFinder::GridRemoveUnderlinePartitions() {
1221 TabVector_IT hline_it(&horizontal_lines_);
1222 for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
1223 TabVector* hline = hline_it.data();
1224 if (hline->intersects_other_lines())
1226 TBOX line_box = BoxFromHLine(hline);
1227 TBOX search_box = line_box;
1230 part_search.SetUniqueMode(
true);
1231 part_search.StartRectSearch(search_box);
1232 ColPartition* covered;
1233 bool touched_table =
false;
1234 bool touched_text =
false;
1235 ColPartition* line_part =
nullptr;
1236 while ((covered = part_search.NextRectSearch()) !=
nullptr) {
1238 touched_table =
true;
1240 }
else if (covered->IsTextType()) {
// Text counts as touched when its median bottom lies between the bottom of
// the line box and the top of the search box.
1242 int text_bottom = covered->median_bottom();
1243 if (line_box.
bottom() <= text_bottom && text_bottom <= search_box.
top())
1244 touched_text =
true;
1245 }
// An HLINE partition fully inside the line box is remembered as line_part.
else if (covered->blob_type() ==
BRT_HLINE &&
1246 line_box.
contains(covered->bounding_box())) {
1247 line_part = covered;
1250 if (line_part !=
nullptr && !touched_table && touched_text) {
// For each entry of horizontal_lines_, computes the line's box with
// BoxFromHLine and searches that box, testing each partition found for image
// type; any_image tracks the result.
1258 void ColumnFinder::GridInsertHLinePartitions() {
1259 TabVector_IT hline_it(&horizontal_lines_);
1260 for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
1261 TabVector* hline = hline_it.data();
1262 TBOX line_box = BoxFromHLine(hline);
1267 bool any_image =
false;
1269 part_search.SetUniqueMode(
true);
1270 part_search.StartRectSearch(line_box);
1271 ColPartition* covered;
1272 while ((covered = part_search.NextRectSearch()) !=
nullptr) {
1273 if (covered->IsImageType()) {
// Vertical counterpart of GridInsertHLinePartitions: visits the vertical
// lines, tests IsSeparator() on each, builds a box from the line's x extent
// plus its mean_width(), and searches the resulting partition's bounding box
// for image-type partitions.
1286 void ColumnFinder::GridInsertVLinePartitions() {
1288 for (vline_it.mark_cycle_pt(); !vline_it.cycled_list(); vline_it.forward()) {
1289 TabVector* vline = vline_it.data();
1290 if (!vline->IsSeparator())
1292 int left = std::min(vline->startpt().x(), vline->endpt().x());
1293 int right = std::max(vline->startpt().x(), vline->endpt().x());
1294 right += vline->mean_width();
// Special handling when the box would have zero width.
1295 if (left == right) {
1303 left, vline->startpt().
y(), right, vline->endpt().y());
1305 bool any_image =
false;
1307 part_search.SetUniqueMode(
true);
1308 part_search.StartRectSearch(part->bounding_box());
1309 ColPartition* covered;
1310 while ((covered = part_search.NextRectSearch()) !=
nullptr) {
1311 if (covered->IsImageType()) {
// Calls SetPartitionType on every partition in part_grid_, passing
// resolution_ and the column set assigned to the partition's grid row.
1325 void ColumnFinder::SetPartitionTypes() {
1326 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1327 gsearch(&part_grid_);
1328 gsearch.StartFullSearch();
1330 while ((part = gsearch.NextFullSearch()) !=
nullptr) {
1331 part->SetPartitionType(
resolution_, best_columns_[gsearch.GridY()]);
// Scans every partition in part_grid_. A partition with a
// SingletonPartner(true) gets a consistency check against that partner;
// otherwise, if it has a SingletonPartner(false), SmoothPartnerRun is called
// on it.
// NOTE(review): the diagnostics suggest true selects the upper partner and
// false the lower one - confirm.
1337 void ColumnFinder::SmoothPartnerRuns() {
1339 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1340 gsearch(&part_grid_);
1341 gsearch.StartFullSearch();
1343 while ((part = gsearch.NextFullSearch()) !=
nullptr) {
1344 ColPartition* partner = part->SingletonPartner(
true);
1345 if (partner !=
nullptr) {
// The relationship must be mutual; if not, diagnostics are printed before
// the ASSERT_HOST below is reached.
1346 if (partner->SingletonPartner(
false) != part) {
1347 tprintf(
"Ooops! Partition:(%d partners)",
1348 part->upper_partners()->length());
1350 tprintf(
"has singleton partner:(%d partners",
1351 partner->lower_partners()->length());
1353 tprintf(
"but its singleton partner is:");
1354 if (partner->SingletonPartner(
false) ==
nullptr)
1357 partner->SingletonPartner(
false)->Print();
1359 ASSERT_HOST(partner->SingletonPartner(
false) == part);
1360 }
else if (part->SingletonPartner(
false) !=
nullptr) {
1361 ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
1363 part->SmoothPartnerRun(column_count * 2 + 1);
// Inserts part into temp_list at a position chosen by scanning the list.
// If the scan runs through the whole list, part goes at the end; otherwise it
// goes before the element where the scan stopped.
1370 void ColumnFinder::AddToTempPartList(ColPartition* part,
1371 ColPartition_CLIST* temp_list) {
1372 int mid_y = part->MidY();
1373 ColPartition_C_IT it(temp_list);
1374 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1375 ColPartition* test_part = it.data();
// Tests applied to each existing element: is it part's SingletonPartner(false),
// is its median mid-y below part's, and does it neither overlap part
// horizontally nor share its margins.
1378 if (test_part == part->SingletonPartner(
false))
1380 int neighbour_bottom = test_part->median_bottom();
1381 int neighbour_top = test_part->median_top();
1382 int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
1383 if (neighbour_y < mid_y)
1385 if (!part->HOverlaps(*test_part) && !part->WithinSameMargins(*test_part))
1388 if (it.cycled_list()) {
1389 it.add_to_end(part);
1391 it.add_before_stay_put(part);
// Processes entries of temp_list until the list is empty, passing good_parts_
// and work_set to the call made for each entry.
1396 void ColumnFinder::EmptyTempPartList(ColPartition_CLIST* temp_list,
1397 WorkingPartSet_LIST* work_set) {
1398 ColPartition_C_IT it(temp_list);
1399 while (!it.empty()) {
1401 &good_parts_, work_set);
// Converts the contents of part_grid_ into blocks and to_blocks.
// Partitions are visited with a full grid search and buffered per grid row in
// temp_part_list. The buffer is flushed into work_set on each row change, and
// the working sets are finally drained into the output lists.
1407 void ColumnFinder::TransformToBlocks(BLOCK_LIST* blocks,
1408 TO_BLOCK_LIST* to_blocks) {
1409 WorkingPartSet_LIST work_set;
1410 ColPartitionSet* column_set =
nullptr;
1411 ColPartition_IT noise_it(&noise_parts_);
1415 ColPartition_CLIST temp_part_list;
1417 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1418 gsearch(&part_grid_);
1419 gsearch.StartFullSearch();
// -1 ensures the first partition always registers as a row change.
1420 int prev_grid_y = -1;
1422 while ((part = gsearch.NextFullSearch()) !=
nullptr) {
1423 int grid_y = gsearch.GridY();
1424 if (grid_y != prev_grid_y) {
1425 EmptyTempPartList(&temp_part_list, &work_set);
1426 prev_grid_y = grid_y;
// A change of the row's assigned column set is handled here.
1428 if (best_columns_[grid_y] != column_set) {
1429 column_set = best_columns_[grid_y];
1433 &good_parts_, &work_set);
1435 tprintf(
"Changed column groups at grid index %d, y=%d\n",
1436 gsearch.GridY(), gsearch.GridY() *
gridsize());
// A partition goes either to noise_parts_ or into the per-row buffer.
1439 noise_it.add_to_end(part);
1441 AddToTempPartList(part, &temp_part_list);
// Flush whatever remains from the last row.
1444 EmptyTempPartList(&temp_part_list, &work_set);
// Drain work_set: each working set is extracted from the list in turn.
1446 WorkingPartSet_IT work_it(&work_set);
1447 while (!work_it.empty()) {
1448 WorkingPartSet* working_set = work_it.extract();
1450 &good_parts_, blocks, to_blocks);
// File-local helper: calls reflect_box_in_y_axis() on every blob in the list.
1458 static void ReflectBlobList(BLOBNBOX_LIST* bblobs) {
1459 BLOBNBOX_IT it(bblobs);
1460 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1461 it.data()->reflect_box_in_y_axis();
// Reflects the blobs of bblobs and of input_block->blobs with
// ReflectBlobList, then installs a newly allocated DENORM as denorm_.
1471 void ColumnFinder::ReflectForRtl(
TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs) {
1472 ReflectBlobList(bblobs);
1473 ReflectBlobList(&input_block->
blobs);
1478 auto* new_denorm =
new DENORM;
// NOTE(review): this argument list differs from the other DENORM setup in
// this file only by its -1.0f, presumably an x scale that encodes the
// reflection - confirm.
1480 0.0f, 0.0f, -1.0f, 1.0f, 0.0f, 0.0f);
1481 denorm_ = new_denorm;
// File-local helper that visits every blob in bblobs.
// A blob whose outline list is not a singleton has its outlines processed
// one by one, with new blobs inserted after the current position.
// The blob's cblob is rotated by blob_rotation unless that rotation is
// exactly (1, 0).
1487 static void RotateAndExplodeBlobList(
const FCOORD& blob_rotation,
1488 BLOBNBOX_LIST* bblobs,
1491 BLOBNBOX_IT it(bblobs);
1492 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1495 C_OUTLINE_LIST* outlines = cblob->
out_list();
1496 C_OUTLINE_IT ol_it(outlines);
1497 if (!outlines->singleton()) {
// This loop runs until the outline list is empty.
1500 for (;!ol_it.empty(); ol_it.forward()) {
// The iterator stays on the current blob after the insert.
1506 it.add_after_stay_put(new_blob);
1512 if (blob_rotation.
x() != 1.0f || blob_rotation.
y() != 0.0f) {
1513 cblob->
rotate(blob_rotation);
// Visits every TO_BLOCK in blocks. For each one it obtains a blob rotation
// from ComputeBlockAndClassifyRotation and applies RotateAndExplodeBlobList
// to the block's blobs and to the blob list of each of its rows.
1534 void ColumnFinder::RotateAndReskewBlocks(
bool input_is_rtl,
1535 TO_BLOCK_LIST* blocks) {
1542 TO_BLOCK_IT it(blocks);
// The block counter starts at 1.
1543 int block_index = 1;
1544 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1558 FCOORD blob_rotation = ComputeBlockAndClassifyRotation(block);
1563 RotateAndExplodeBlobList(blob_rotation, &to_block->
blobs,
1565 TO_ROW_IT row_it(to_block->
get_rows());
1566 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1567 TO_ROW* row = row_it.data();
1568 RotateAndExplodeBlobList(blob_rotation, row->
blob_list(),
// The median height is rounded to the nearest int by adding 0.5 before the
// cast.
1572 static_cast<int>(heights.
median() + 0.5));
1574 tprintf(
"Block median size = (%d, %d)\n",
// Computes the rotations for one block and returns the rotation to apply to
// its blobs.
// The return value is a copy of block_rotation taken just before
// block_rotation's y component is negated.
1584 FCOORD ColumnFinder::ComputeBlockAndClassifyRotation(
BLOCK* block) {
// Defaults: classify with text_rotation_, and no block rotation.
1593 FCOORD classify_rotation(text_rotation_);
1594 FCOORD block_rotation(1.0f, 0.0f);
// When rerotate_ has x == 0 it is used as the block rotation; (0, -1) is the
// other value assigned here.
1600 if (rerotate_.
x() == 0.0f)
1601 block_rotation = rerotate_;
1603 block_rotation =
FCOORD(0.0f, -1.0f);
// The block is rotated and classify_rotation is reset to (1, 0).
1604 block->
rotate(block_rotation);
1605 classify_rotation =
FCOORD(1.0f, 0.0f);
1607 block_rotation.rotate(rotation_);
// Snapshot for the blobs first, then flip the sign of y on block_rotation.
1611 FCOORD blob_rotation(block_rotation);
1612 block_rotation.set_y(-block_rotation.y());
1616 tprintf(
"Blk %d, type %d rerotation(%.2f, %.2f), char(%.2f,%.2f), box:",
1619 classify_rotation.x(), classify_rotation.y());
1622 return blob_rotation;