20 #include "config_auto.h"
// Integer debug parameter, default 0. Elsewhere in this file it is tested both
// for non-zero and for "> 1" to decide whether debug windows/output are made.
40 static INT_VAR(textord_tabfind_show_strokewidths, 0,
"Show stroke widths");
// Boolean debug parameter, default false.
41 static BOOL_VAR(textord_tabfind_only_strokewidths,
false,
"Only run stroke widths");
111 :
BlobGrid(gridsize, bleft, tright), nontext_map_(nullptr), projection_(nullptr),
112 denorm_(nullptr), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
113 leaders_win_ =
nullptr;
114 widths_win_ =
nullptr;
115 initial_widths_win_ =
nullptr;
116 chains_win_ =
nullptr;
117 diacritics_win_ =
nullptr;
118 textlines_win_ =
nullptr;
119 smoothed_win_ =
nullptr;
123 if (widths_win_ !=
nullptr) {
124 #ifndef GRAPHICS_DISABLED
126 #endif // GRAPHICS_DISABLED
127 if (textord_tabfind_only_strokewidths)
132 delete initial_widths_win_;
134 delete textlines_win_;
135 delete smoothed_win_;
136 delete diacritics_win_;
145 BLOBNBOX_IT blob_it(&block->
blobs);
146 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
147 SetNeighbours(
false,
false, blob_it.data());
160 InsertBlobs(input_block);
162 while (cjk_merge && FixBrokenCJK(input_block));
164 FindTextlineFlowDirection(pageseg_mode,
false);
// Walks input_blobs and distributes blobs over three caller-supplied lists.
// For each blob it bumps *num_vertical_blobs or *num_horizontal_blobs and, only
// when ok_blob is set, appends the blob to vertical_blobs / horizontal_blobs;
// remaining ok_blob blobs go to nondescript_blobs.
// NOTE(review): the tests that pick the vertical/horizontal branch and the
// definition of ok_blob are not visible in this excerpt.
170 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
171 int* num_vertical_blobs,
172 int* num_horizontal_blobs,
173 BLOBNBOX_CLIST* vertical_blobs,
174 BLOBNBOX_CLIST* horizontal_blobs,
175 BLOBNBOX_CLIST* nondescript_blobs) {
176 BLOBNBOX_C_IT v_it(vertical_blobs);
177 BLOBNBOX_C_IT h_it(horizontal_blobs);
178 BLOBNBOX_C_IT n_it(nondescript_blobs);
179 BLOBNBOX_IT blob_it(input_blobs);
180 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// y_x is height/width of the box; x_y is its reciprocal.
183 float y_x = static_cast<float>(box.
height()) / box.
width();
184 float x_y = 1.0f / y_x;
// ratio is whichever of the two aspect ratios is larger.
186 float ratio = x_y > y_x ? x_y : y_x;
190 ++*num_vertical_blobs;
191 if (ok_blob) v_it.add_after_then_move(blob);
193 ++*num_horizontal_blobs;
194 if (ok_blob) h_it.add_after_then_move(blob);
195 }
else if (ok_blob) {
196 n_it.add_after_then_move(blob);
210 BLOBNBOX_CLIST* osd_blobs) {
211 int vertical_boxes = 0;
212 int horizontal_boxes = 0;
214 BLOBNBOX_CLIST vertical_blobs;
215 BLOBNBOX_CLIST horizontal_blobs;
216 BLOBNBOX_CLIST nondescript_blobs;
217 CollectHorizVertBlobs(&block->
blobs, &vertical_boxes, &horizontal_boxes,
218 &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
219 CollectHorizVertBlobs(&block->
large_blobs, &vertical_boxes, &horizontal_boxes,
220 &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
222 tprintf(
"TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
223 horizontal_boxes, vertical_boxes,
224 horizontal_blobs.length(), vertical_blobs.length(),
225 nondescript_blobs.length());
226 if (osd_blobs !=
nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
228 BLOBNBOX_C_IT osd_it(osd_blobs);
229 osd_it.add_list_after(&nondescript_blobs);
232 int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
233 find_vertical_text_ratio);
234 if (vertical_boxes >= min_vert_boxes) {
235 if (osd_blobs !=
nullptr) {
236 BLOBNBOX_C_IT osd_it(osd_blobs);
237 osd_it.add_list_after(&vertical_blobs);
241 if (osd_blobs !=
nullptr) {
242 BLOBNBOX_C_IT osd_it(osd_blobs);
243 osd_it.add_list_after(&horizontal_blobs);
254 rerotation_.
set_x(rotation.
x());
255 rerotation_.
set_y(-rotation.
y());
263 ColPartition_LIST leader_parts;
264 FindLeadersAndMarkNoise(block, &leader_parts);
268 for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
271 MarkLeaderNeighbours(part,
LR_LEFT);
272 MarkLeaderNeighbours(part,
LR_RIGHT);
293 TBOX search_box = box;
294 search_box.
pad(padding, padding);
301 rsearch.StartRectSearch(search_box);
302 while ((n = rsearch.NextRectSearch()) !=
nullptr) {
303 if (n == bbox)
continue;
305 if (nbox.
height() > max_height) {
306 max_height = nbox.
height();
310 tprintf(
"Max neighbour size=%d for candidate line box at:", max_height);
314 #ifndef GRAPHICS_DISABLED
315 if (leaders_win_ !=
nullptr) {
322 #endif // GRAPHICS_DISABLED
345 Pix* nontext_pix,
const DENORM* denorm,
bool cjk_script,
348 nontext_map_ = nontext_pix;
349 projection_ = projection;
360 FindTextlineFlowDirection(pageseg_mode,
false);
362 if (textord_tabfind_show_strokewidths) {
373 FindTextlineFlowDirection(pageseg_mode,
true);
375 FindInitialPartitions(pageseg_mode, rerotation,
true, block,
376 diacritic_blobs, part_grid, big_parts, &skew);
378 tprintf(
"Detected %d diacritics\n", diacritic_blobs->length());
382 FindTextlineFlowDirection(pageseg_mode,
true);
383 r = FindInitialPartitions(pageseg_mode, rerotation,
false, block,
384 diacritic_blobs, part_grid, big_parts, &skew);
386 nontext_map_ =
nullptr;
387 projection_ =
nullptr;
// Debug helper: prints, via tprintf, a box and the values labelled h-width,
// v-width and p-width for neighbour.
// The p-width field previously used "%1.f" (minimum width 1, precision 0),
// which rounds away the fractional part; it now uses "%.1f" so all three
// width fields are printed with one decimal place.
391 static void PrintBoxWidths(
BLOBNBOX* neighbour) {
393 tprintf(
"Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%.1f\n",
406 FCOORD click(static_cast<float>(x), static_cast<float>(y));
410 PrintBoxWidths(neighbour);
421 tprintf(
"Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
422 "Good= %d %d %d %d\n",
// Collects leader partitions into leader_parts and moves blobs between the
// block's blob lists. Only part of the body is visible in this excerpt.
445 void StrokeWidth::FindLeadersAndMarkNoise(
TO_BLOCK* block,
446 ColPartition_LIST* leader_parts) {
// First pass: SetNeighbours(true, false, ...) on every blob in the grid
// (first argument is the "leaders" flag of SetNeighbours).
452 gsearch.StartFullSearch();
453 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
454 SetNeighbours(
true,
false, bbox);
// Second pass: build candidate partitions; new leaders are appended through
// part_it, which iterates leader_parts.
456 ColPartition_IT part_it(leader_parts);
457 gsearch.StartFullSearch();
458 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
// Walk a chain of blobs starting at bbox while the flow is still BTFT_NONE.
466 for (blob = bbox; blob !=
nullptr && blob->
flow() ==
BTFT_NONE;
// Keep the partition only if MarkAsLeaderIfMonospaced() returns true.
473 if (part->MarkAsLeaderIfMonospaced())
474 part_it.add_after_then_move(part);
479 if (textord_tabfind_show_strokewidths) {
480 leaders_win_ = DisplayGoodBlobs(
"LeaderNeighbours", 0, 0);
484 BLOBNBOX_IT blob_it(&block->
blobs);
// Blobs extracted from small_it are appended to block->blobs.
// NOTE(review): the condition guarding the move is not visible here.
486 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
492 blob_it.add_to_end(small_it.extract());
// Blobs extracted from noise_it are appended to the list behind small_it.
499 for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
502 small_it.add_to_end(noise_it.extract());
514 void StrokeWidth::InsertBlobs(
TO_BLOCK* block) {
// Side-searches from the bounding box of part (leftwards when side == LR_LEFT)
// and tracks the candidate blob with the smallest x_gap to part_box.
// NOTE(review): what is done with best_blob once found is not visible in this
// excerpt; the leaders_win_ block is compiled only without GRAPHICS_DISABLED.
522 void StrokeWidth::MarkLeaderNeighbours(
const ColPartition* part,
524 const TBOX& part_box = part->bounding_box();
529 blobsearch.StartSideSearch(side ==
LR_LEFT ? part_box.
left()
533 while ((blob = blobsearch.NextSideSearch(side ==
LR_LEFT)) !=
nullptr) {
// Horizontal gap between the candidate blob and the partition box.
537 int x_gap = blob_box.
x_gap(part_box);
540 }
// Taken for the first candidate or for one closer than the best so far.
else if (best_blob ==
nullptr || x_gap < best_gap) {
545 if (best_blob !=
nullptr) {
550 #ifndef GRAPHICS_DISABLED
551 if (leaders_win_ !=
nullptr) {
557 #endif // GRAPHICS_DISABLED
// Iterates blobs, accumulates heights into sizes, and returns the 0.75
// fractile (sizes.ile(0.75f)) rounded to the nearest int.
// NOTE(review): which blobs contribute a height, and how gridsize is used, is
// not visible in this excerpt.
562 static int UpperQuartileCJKSize(
int gridsize, BLOBNBOX_LIST* blobs) {
564 BLOBNBOX_IT it(blobs);
565 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
570 sizes.add(height, 1);
// Adding 0.5 before the truncating cast rounds to nearest.
572 return static_cast<int>(sizes.ile(0.75f) + 0.5);
// Scans block->blobs for blobs whose overlapping neighbours (collected by
// AccumulateOverlaps) can be merged, emitting debug output along the way.
// NOTE(review): the merge itself and the returned value are not visible in
// this excerpt; a caller loops on this function while it returns true.
580 bool StrokeWidth::FixBrokenCJK(
TO_BLOCK* block) {
581 BLOBNBOX_LIST* blobs = &block->
blobs;
// Despite its name, median_height is the value from UpperQuartileCJKSize.
582 int median_height = UpperQuartileCJKSize(
gridsize(), blobs);
586 BLOBNBOX_IT blob_it(blobs);
588 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
596 tprintf(
"Checking for Broken CJK (max size=%d):", max_height);
// Gather the neighbours of blob that are acceptable merge partners; bbox
// is passed by pointer so AccumulateOverlaps can update it.
600 BLOBNBOX_CLIST overlapped_blobs;
601 AccumulateOverlaps(blob, debug, max_height, max_dist,
602 &bbox, &overlapped_blobs);
603 if (!overlapped_blobs.empty()) {
610 tprintf(
"Bad final aspectratio:");
618 tprintf(
"Too many neighbours: %d\n", overlapped_blobs.length());
622 BLOBNBOX_C_IT n_it(&overlapped_blobs);
623 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
625 neighbour = n_it.data();
// True when the loop above stopped before completing a full cycle.
630 if (!n_it.cycled_list()) {
633 PrintBoxWidths(blob);
643 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
// rerotation_ differs from (1, 0), the value set in the constructor.
649 if (rerotation_.
x() != 1.0f || rerotation_.
y() != 0.0f) {
662 int num_remaining = 0;
663 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// Tests whether nbox may be merged into bbox. Writes the x and y gaps between
// the two boxes to *x_gap and *y_gap, then requires both gaps to be at most
// max_dist and the merged box to be at most max_size in width and height.
// NOTE(review): how old_ratio and new_ratio feed the final result is not
// visible in this excerpt.
680 static bool AcceptableCJKMerge(
const TBOX& bbox,
const TBOX& nbox,
681 bool debug,
int max_size,
int max_dist,
682 int* x_gap,
int* y_gap) {
683 *x_gap = bbox.
x_gap(nbox);
684 *y_gap = bbox.
y_gap(nbox);
688 tprintf(
"gaps = %d, %d, merged_box:", *x_gap, *y_gap);
691 if (*x_gap <= max_dist && *y_gap <= max_dist &&
692 merged.width() <= max_size && merged.height() <= max_size) {
// Aspect ratios are inverted when below 1 so that both are >= 1.
694 double old_ratio = static_cast<double>(bbox.
width()) / bbox.
height();
695 if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
696 double new_ratio = static_cast<double>(merged.width()) / merged.height();
697 if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
// Collects into blobs the neighbours (other than not_this) that
// AcceptableCJKMerge accepts for *bbox, using a radius search around the
// centre of *bbox. Rejected neighbours may be remembered in nearests[], one
// slot per direction (BND_COUNT entries).
// NOTE(review): large parts of the body are missing from this excerpt; the
// comments below describe only what the visible lines do.
708 void StrokeWidth::AccumulateOverlaps(
const BLOBNBOX* not_this,
bool debug,
709 int max_size,
int max_dist,
710 TBOX* bbox, BLOBNBOX_CLIST* blobs) {
716 for (
auto & nearest : nearests) {
// Centre point of the current box.
719 int x = (bbox->
left() + bbox->
right()) / 2;
720 int y = (bbox->
bottom() + bbox->
top()) / 2;
725 while ((neighbour = radsearch.NextRadSearch()) !=
nullptr) {
726 if (neighbour == not_this)
continue;
729 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
733 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>,
true, neighbour);
// Re-test the remembered nearests against the current *bbox.
739 for (
int dir = 0; dir <
BND_COUNT; ++dir) {
740 if (nearests[dir] ==
nullptr)
continue;
741 nbox = nearests[dir]->bounding_box();
742 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
743 max_dist, &x_gap, &y_gap)) {
746 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>,
true, nearests[dir]);
// The slot is cleared once its blob has been added to blobs.
751 nearests[dir] =
nullptr;
755 }
// Negative x_gap that is not greater than y_gap: keep the candidate with the
// smaller y_gap in nearests[dir].
else if (x_gap < 0 && x_gap <= y_gap) {
758 if (nearests[dir] ==
nullptr ||
759 y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
760 nearests[dir] = neighbour;
762 }
// Mirror case: negative y_gap that is not greater than x_gap.
else if (y_gap < 0 && y_gap <= x_gap) {
765 if (nearests[dir] ==
nullptr ||
766 x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
767 nearests[dir] = neighbour;
// Final check of the remaining nearests against the resulting box.
776 for (
auto & nearest : nearests) {
777 if (nearest ==
nullptr)
continue;
778 const TBOX& nbox = nearest->bounding_box();
780 tprintf(
"Testing for overlap with:");
// shallow_clear() empties the output list.
// NOTE(review): the guarding condition is not visible in this excerpt.
784 blobs->shallow_clear();
786 tprintf(
"Final box overlaps nearest\n");
// Runs a sequence of full-grid passes over the blobs: neighbour setup,
// simplification of obvious neighbours, flow assignment, and three smoothing
// passes. Debug windows are created when textord_tabfind_show_strokewidths is
// non-zero and display_if_debugging is set, or when the parameter is > 1.
799 void StrokeWidth::FindTextlineFlowDirection(
PageSegMode pageseg_mode,
800 bool display_if_debugging) {
// Pass 1: SetNeighbours with leaders == false; display_if_debugging is passed
// as the activate_line_trap argument.
804 gsearch.StartFullSearch();
805 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
806 SetNeighbours(
false, display_if_debugging, bbox);
// Pass 2: SimplifyObviousNeighbours on every blob.
809 gsearch.StartFullSearch();
810 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
811 SimplifyObviousNeighbours(bbox);
// Pass 3: per-blob flow assignment, with separate branches for
// vertical-only and horizontal-only page segmentation modes.
// NOTE(review): the branch bodies are not visible in this excerpt.
814 gsearch.StartFullSearch();
815 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
816 if (FindingVerticalOnly(pageseg_mode)) {
819 }
else if (FindingHorizontalOnly(pageseg_mode)) {
823 SetNeighbourFlows(bbox);
826 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
827 textord_tabfind_show_strokewidths > 1) {
828 initial_widths_win_ = DisplayGoodBlobs(
"InitialStrokewidths", 400, 0);
// Smoothing pass with reset_all == false.
831 gsearch.StartFullSearch();
832 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
833 SmoothNeighbourTypes(pageseg_mode,
false, bbox);
// Two further smoothing passes with reset_all == true.
836 gsearch.StartFullSearch();
837 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
838 SmoothNeighbourTypes(pageseg_mode,
true, bbox);
841 gsearch.StartFullSearch();
842 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
843 SmoothNeighbourTypes(pageseg_mode,
true, bbox);
845 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
846 textord_tabfind_show_strokewidths > 1) {
847 widths_win_ = DisplayGoodBlobs(
"ImprovedStrokewidths", 800, 0);
// Calls FindGoodNeighbour for each of the BND_COUNT directions and sums the
// returned values into line_trap_count.
// NOTE(review): the action taken when line_trap_count > 0 and
// activate_line_trap is set is not visible in this excerpt.
855 void StrokeWidth::SetNeighbours(
bool leaders,
bool activate_line_trap,
857 int line_trap_count = 0;
858 for (
int dir = 0; dir <
BND_COUNT; ++dir) {
859 auto bnd = static_cast<BlobNeighbourDir>(dir);
860 line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
862 if (line_trap_count > 0 && activate_line_trap) {
884 tprintf(
"FGN in dir %d for blob:", dir);
887 int top = blob_box.
top();
888 int bottom = blob_box.
bottom();
889 int left = blob_box.
left();
890 int right = blob_box.
right();
891 int width = right - left;
892 int height = top - bottom;
900 int line_trap_count = 0;
903 ? height / 2 : width / 2;
905 ? height / 3 : width / 3;
907 min_good_overlap = min_decent_overlap = 1;
909 int search_pad = static_cast<int>(
913 TBOX search_box = blob_box;
926 search_box.
set_top(search_box.
top() + search_pad);
933 rectsearch.StartRectSearch(search_box);
935 double best_goodness = 0.0;
936 bool best_is_good =
false;
938 while ((neighbour = rectsearch.NextRectSearch()) !=
nullptr) {
940 if (neighbour == blob)
942 int mid_x = (nbox.
left() + nbox.
right()) / 2;
943 if (mid_x < blob->left_rule() || mid_x > blob->
right_rule())
952 int n_width = nbox.
width();
953 int n_height = nbox.
height();
954 if (std::min(n_width, n_height) > line_trap_min &&
955 std::max(n_width, n_height) < line_trap_max)
961 std::max(width, height)) &&
966 if (debug)
tprintf(
"Bad size\n");
978 overlap = std::min(static_cast<int>(nbox.
top()), top) - std::max(static_cast<int>(nbox.
bottom()), bottom);
980 perp_overlap = nbox.
width();
982 perp_overlap = overlap;
985 if (debug)
tprintf(
"On wrong side\n");
990 overlap = std::min(static_cast<int>(nbox.
right()), right) - std::max(static_cast<int>(nbox.
left()), left);
992 perp_overlap = nbox.
height();
994 perp_overlap = overlap;
997 if (debug)
tprintf(
"On wrong side\n");
1002 if (-gap > overlap) {
1003 if (debug)
tprintf(
"Overlaps wrong way\n");
1006 if (perp_overlap < min_decent_overlap) {
1007 if (debug)
tprintf(
"Doesn't overlap enough\n");
1012 bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1019 if (gap < 1) gap = 1;
1020 double goodness = (1.0 + is_good) * overlap / gap;
1022 tprintf(
"goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1023 goodness, best_goodness, is_good, overlap, gap);
1025 if (goodness > best_goodness) {
1026 best_neighbour = neighbour;
1027 best_goodness = goodness;
1028 best_is_good = is_good;
1032 return line_trap_count;
// Adds each non-null neighbour of blob, one per direction in
// [0, BND_COUNT), to neighbours via add_sorted with SortByBoxLeft.
1036 static void ListNeighbours(
const BLOBNBOX* blob,
1037 BLOBNBOX_CLIST* neighbours) {
1038 for (
int dir = 0; dir <
BND_COUNT; ++dir) {
1039 auto bnd = static_cast<BlobNeighbourDir>(dir);
1041 if (neighbour !=
nullptr) {
1042 neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>,
true, neighbour);
// Lists the neighbours of blob, then the neighbours of each of its non-null
// neighbours, all into the same neighbours list.
1048 static void List2ndNeighbours(
const BLOBNBOX* blob,
1049 BLOBNBOX_CLIST* neighbours) {
1050 ListNeighbours(blob, neighbours);
1051 for (
int dir = 0; dir <
BND_COUNT; ++dir) {
1052 auto bnd = static_cast<BlobNeighbourDir>(dir);
1054 if (neighbour !=
nullptr) {
1055 ListNeighbours(neighbour, neighbours);
// One level deeper than List2ndNeighbours: applies List2ndNeighbours to blob
// and then to each of its non-null neighbours.
1061 static void List3rdNeighbours(
const BLOBNBOX* blob,
1062 BLOBNBOX_CLIST* neighbours) {
1063 List2ndNeighbours(blob, neighbours);
1064 for (
int dir = 0; dir <
BND_COUNT; ++dir) {
1065 auto bnd = static_cast<BlobNeighbourDir>(dir);
1067 if (neighbour !=
nullptr) {
1068 List2ndNeighbours(neighbour, neighbours);
// Iterates neighbours and classifies each one from its horizontal gap range
// [h_min, h_max] and vertical gap range [v_min, v_max], printing the outcome
// ("Horz", "Vert" or "Neither") when debug is set.
// NOTE(review): the updates to *pure_h_count / *pure_v_count and the second
// half of the horizontal test are not visible in this excerpt.
1075 static void CountNeighbourGaps(
bool debug, BLOBNBOX_CLIST* neighbours,
1076 int* pure_h_count,
int* pure_v_count) {
1079 BLOBNBOX_C_IT it(neighbours);
1080 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1082 int h_min, h_max, v_min, v_max;
1085 tprintf(
"Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
// Horizontal case: every horizontal gap is smaller than every vertical gap
// (or the additional condition that is cut off here holds).
1086 if (h_max < v_min ||
1090 if (debug)
tprintf(
"Horz at:");
1091 }
// Vertical case: every vertical gap is smaller than every horizontal gap.
else if (v_max < h_min) {
1094 if (debug)
tprintf(
"Vert at:");
1096 if (debug)
tprintf(
"Neither at:");
// Gathers the neighbours of blob up to three steps away, counts how many are
// purely horizontal or purely vertical with CountNeighbourGaps, and branches
// when one count is more than twice the other.
// NOTE(review): what each branch assigns is not visible in this excerpt.
1106 void StrokeWidth::SetNeighbourFlows(
BLOBNBOX* blob) {
1112 tprintf(
"SetNeighbourFlows (current flow=%d, type=%d) on:",
1116 BLOBNBOX_CLIST neighbours;
1117 List3rdNeighbours(blob, &neighbours);
1119 int pure_h_count = 0;
1120 int pure_v_count = 0;
1121 CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1125 tprintf(
"SetFlows: h_count=%d, v_count=%d\n",
1126 pure_h_count, pure_v_count);
1128 if (!neighbours.empty()) {
// Horizontal wins only with more than a 2:1 majority, and vice versa.
1131 if (pure_h_count > 2 * pure_v_count) {
1134 }
else if (pure_v_count > 2 * pure_h_count) {
// Iterates neighbours with results reported through *pure_h_count and
// *pure_v_count.
// NOTE(review): the loop body (how each neighbour is counted) is not visible
// in this excerpt.
1147 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1148 int* pure_h_count,
int* pure_v_count) {
1149 BLOBNBOX_C_IT it(neighbours);
1150 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
// Compares the horizontal gap range [h_min, h_max] of blob with its vertical
// range [v_min, v_max], using a margin, to detect clearly one-directional
// neighbourhoods.
// NOTE(review): how margin is derived and what each branch does are not
// visible in this excerpt.
1162 void StrokeWidth::SimplifyObviousNeighbours(
BLOBNBOX* blob) {
1184 int h_min, h_max, v_min, v_max;
// Horizontal gaps are smaller than vertical ones by more than margin, and
// also below margin / 2 (or the cut-off alternative condition holds).
1186 if ((h_max + margin < v_min && h_max < margin / 2) ||
1191 }
// Mirror test for the vertical direction.
else if (v_max + margin < h_min && v_max < margin / 2) {
// Counts purely horizontal and purely vertical blobs among the neighbours of
// blob (up to two steps away) and branches on the majority, unless the page
// segmentation mode forces the opposite direction.
// NOTE(review): the assignments made in each branch and the use of reset_all
// are not visible in this excerpt.
1201 void StrokeWidth::SmoothNeighbourTypes(
PageSegMode pageseg_mode,
bool reset_all,
1205 BLOBNBOX_CLIST neighbours;
1206 List2ndNeighbours(blob, &neighbours);
1208 int pure_h_count = 0;
1209 int pure_v_count = 0;
1210 CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1215 tprintf(
"pure_h=%d, pure_v=%d\n",
1216 pure_h_count, pure_v_count);
// Horizontal majority, and the mode is not vertical-only.
1218 if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1222 }
// Vertical majority, and the mode is not horizontal-only.
else if (pure_v_count > pure_h_count &&
1223 !FindingHorizontalOnly(pageseg_mode)) {
1232 tprintf(
"Clean on pass 3!\n");
1250 TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1251 ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1253 if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1254 if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1255 if (textord_tabfind_show_strokewidths) {
1256 chains_win_ =
MakeWindow(0, 400,
"Initial text chains");
1257 part_grid->DisplayBoxes(chains_win_);
1260 if (find_problems) {
1264 part_grid->SplitOverlappingPartitions(big_parts);
1265 EasyMerges(part_grid);
1266 RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1268 while (part_grid->GridSmoothNeighbours(
BTFT_CHAIN, nontext_map_, grid_box,
1271 grid_box, rerotation));
1272 int pre_overlap = part_grid->ComputeTotalOverlap(
nullptr);
1273 TestDiacritics(part_grid, block);
1274 MergeDiacritics(block, part_grid);
1275 if (find_problems && diacritic_blobs !=
nullptr &&
1276 DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1280 if (textord_tabfind_show_strokewidths) {
1281 textlines_win_ =
MakeWindow(400, 400,
"GoodTextline blobs");
1282 part_grid->DisplayBoxes(textlines_win_);
1283 diacritics_win_ = DisplayDiacritics(
"Diacritics", 0, 0, block);
1285 PartitionRemainingBlobs(pageseg_mode, part_grid);
1286 part_grid->SplitOverlappingPartitions(big_parts);
1287 EasyMerges(part_grid);
1288 while (part_grid->GridSmoothNeighbours(
BTFT_CHAIN, nontext_map_, grid_box,
1291 grid_box, rerotation));
1294 grid_box, rerotation));
1295 if (textord_tabfind_show_strokewidths) {
1296 smoothed_win_ =
MakeWindow(800, 400,
"Smoothed blobs");
1297 part_grid->DisplayBoxes(smoothed_win_);
// Recomputes the total partition overlap (post_overlap), obtaining a
// noise_grid from ComputeTotalOverlap, and when a noise grid was produced
// may move blobs into diacritic_blobs.
// NOTE(review): the comparison of post_overlap with pre_overlap, the return
// values and the ownership of noise_grid are not visible in this excerpt.
1306 bool StrokeWidth::DetectAndRemoveNoise(
int pre_overlap,
const TBOX& grid_box,
1308 ColPartitionGrid* part_grid,
1309 BLOBNBOX_LIST* diacritic_blobs) {
1310 ColPartitionGrid* noise_grid =
nullptr;
1311 int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
// pre_overlap is forced to at least 1 when it is 0.
// NOTE(review): presumably so it can be used as a divisor -- confirm.
1312 if (pre_overlap == 0) pre_overlap = 1;
1313 BLOBNBOX_IT diacritic_it(diacritic_blobs);
1314 if (noise_grid !=
nullptr) {
1318 if (textord_tabfind_show_strokewidths) {
1320 noise_grid->DisplayBoxes(noise_win);
1322 part_grid->DeleteNonLeaderParts();
1325 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// Look for any partition inside search_box; only the first hit is fetched.
1333 rsearch.StartRectSearch(search_box);
1334 ColPartition* part = rsearch.NextRectSearch();
1335 if (part !=
nullptr) {
// The blob is removed from its list and appended to diacritic_blobs.
1339 diacritic_it.add_after_then_move(blob_it.extract());
1342 noise_grid->DeleteParts();
1346 noise_grid->DeleteParts();
1359 if (next_blob ==
nullptr || next_blob->
owner() !=
nullptr ||
// Scans every blob in the grid and, for a blob that has a mutual unused
// vertical neighbour above it, follows the chain upwards (BND_ABOVE) and then
// downwards (BND_BELOW) from bbox, finishing with CompletePartition.
// NOTE(review): the lines that create part and add blobs to it are not
// visible in this excerpt.
1368 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1374 gsearch.StartFullSearch();
1375 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
1380 (blob = MutualUnusedVNeighbour(bbox,
BND_ABOVE)) !=
nullptr) {
// Walk upwards until the chain ends.
1384 while (blob !=
nullptr) {
1386 blob = MutualUnusedVNeighbour(blob,
BND_ABOVE);
// Restart from the seed blob and walk downwards.
1388 blob = MutualUnusedVNeighbour(bbox,
BND_BELOW);
1389 while (blob !=
nullptr) {
1391 blob = MutualUnusedVNeighbour(blob,
BND_BELOW);
1393 CompletePartition(pageseg_mode, part, part_grid);
1405 if (next_blob ==
nullptr || next_blob->
owner() !=
nullptr ||
// Scans every blob in the grid and, for a blob that has a mutual unused
// horizontal neighbour to its right, follows the chain rightwards (BND_RIGHT)
// and then leftwards (BND_LEFT) from bbox, finishing with CompletePartition.
// Fix: the leftward walk previously advanced with MutualUnusedVNeighbour (the
// vertical helper); it now uses MutualUnusedHNeighbour like every other step
// of this function, mirroring how the vertical variant uses its own helper
// in both directions.
// NOTE(review): the lines that create part and add blobs to it are not
// visible in this excerpt.
1414 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1420 gsearch.StartFullSearch();
1421 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
1424 (blob = MutualUnusedHNeighbour(bbox,
BND_RIGHT)) !=
nullptr) {
// Walk rightwards until the chain ends.
1428 while (blob !=
nullptr) {
1430 blob = MutualUnusedHNeighbour(blob,
BND_RIGHT);
// Restart from the seed blob and walk leftwards.
1432 blob = MutualUnusedHNeighbour(bbox,
BND_LEFT);
1433 while (blob !=
nullptr) {
1435 blob = MutualUnusedHNeighbour(blob,
BND_LEFT);
1437 CompletePartition(pageseg_mode, part, part_grid);
// Tests blobs with DiacriticBlob against small_grid, which is filled from
// block->blobs, and moves blobs from block->blobs to the list behind
// small_it, counting small and medium diacritics for the debug report.
// NOTE(review): many conditions and the handling inside the loops are not
// visible in this excerpt.
1449 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid,
TO_BLOCK* block) {
1452 small_grid.InsertBlobList(&block->
blobs);
1453 int medium_diacritics = 0;
1454 int small_diacritics = 0;
1456 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1459 DiacriticBlob(&small_grid, blob)) {
1463 BLOBNBOX_IT blob_it(&block->
blobs);
1464 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1467 small_it.add_to_end(blob_it.extract());
1470 ColPartition* part = blob->
owner();
// Unowned blob that passes the diacritic test: counted as medium and moved.
1471 if (part ==
nullptr && DiacriticBlob(&small_grid, blob)) {
1472 ++medium_diacritics;
1474 small_it.add_to_end(blob_it.extract());
1475 }
// Owned by a partition that is not block-owned and has fewer than 3 boxes.
else if (part !=
nullptr && !part->block_owned() &&
1476 part->boxes_count() < 3) {
// Advance while every box of the partition passes DiacriticBlob; the loop
// completes its cycle only if all of them pass.
1482 BLOBNBOX_C_IT box_it(part->boxes());
1483 for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1484 DiacriticBlob(&small_grid, box_it.data());
1486 if (box_it.cycled_list()) {
1488 while (!box_it.empty()) {
1497 ++medium_diacritics;
1504 small_it.add_to_end(blob_it.extract());
1505 part_grid->RemoveBBox(part);
1510 tprintf(
"Blob not available to be a diacritic at:");
1514 if (textord_tabfind_show_strokewidths) {
1515 tprintf(
"Found %d small diacritics, %d medium\n",
1516 small_diacritics, medium_diacritics);
// Decides whether blob qualifies as a diacritic by rectangle-searching around
// its box (small_box) for candidate base blobs. It tracks two candidates,
// best_x_overlap and best_y_overlap, and reports success through the
// "DiacriticBlob OK!" debug paths or failure through "DiacriticBlob fails:".
// NOTE(review): most thresholds, the return statements and the effects on
// blob are not visible in this excerpt.
1526 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid,
BLOBNBOX* blob) {
1532 small_box.bottom());
1534 tprintf(
"Testing blob for diacriticness at:");
// Centre of the candidate diacritic box.
1537 int x = (small_box.left() + small_box.right()) / 2;
1538 int y = (small_box.bottom() + small_box.top()) / 2;
1541 int height = small_box.height();
1554 BLOBNBOX* best_x_overlap =
nullptr;
1555 BLOBNBOX* best_y_overlap =
nullptr;
1556 int best_total_dist = 0;
// The search area is small_box padded by x_pad / y_pad.
1560 TBOX search_box(small_box);
1563 search_box.
pad(x_pad, y_pad);
1565 rsearch.SetUniqueMode(
true);
1567 rsearch.StartRectSearch(search_box);
1569 while ((neighbour = rsearch.NextRectSearch()) !=
nullptr) {
// Part of a condition involving the blob itself and neighbours that share
// its owner.
1571 neighbour == blob || neighbour->
owner() == blob->
owner())
1578 tprintf(
"Neighbour not strong enough:");
1583 if (nbox.
height() < min_height) {
1585 tprintf(
"Neighbour not big enough:");
1590 int x_gap = small_box.x_gap(nbox);
1591 int y_gap = small_box.y_gap(nbox);
1595 if (debug)
tprintf(
"xgap=%d, y=%d, total dist=%d\n",
1596 x_gap, y_gap, total_distance);
1597 if (total_distance >
1600 tprintf(
"Neighbour with median size %d too far away:",
1608 tprintf(
"Computing reduced box for :");
// Horizontal window one box-width either side of small_box.
1611 int left = small_box.left() - small_box.width();
1612 int right = small_box.right() + small_box.width();
1614 y_gap = small_box.
y_gap(nbox);
// Keep the candidate with the smallest y_gap as best_x_overlap.
1615 if (best_x_overlap ==
nullptr || y_gap < best_y_gap) {
1616 best_x_overlap = neighbour;
1624 tprintf(
"Shrunken box doesn't win:");
// Keep the candidate with the smallest total distance as best_y_overlap.
1628 if (best_y_overlap ==
nullptr || total_distance < best_total_dist) {
1630 tprintf(
"New best y overlap:");
1633 best_y_overlap = neighbour;
1634 best_total_dist = total_distance;
1636 tprintf(
"New y overlap box doesn't win:");
1640 tprintf(
"Neighbour wrong side of a tab:");
1644 if (best_x_overlap !=
nullptr &&
1645 (best_y_overlap ==
nullptr ||
1650 tprintf(
"DiacriticBlob OK! (x-overlap:");
// The y-overlap candidate additionally needs the x gap to be filled and no
// noise between the two boxes.
1656 if (best_y_overlap !=
nullptr &&
1657 DiacriticXGapFilled(small_grid, small_box,
1659 NoNoiseInBetween(small_box, best_y_overlap->
bounding_box())) {
1663 tprintf(
"DiacriticBlob OK! (y-overlap:");
1670 tprintf(
"DiacriticBlob fails:");
1672 tprintf(
"Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1673 if (best_y_overlap !=
nullptr) {
1674 tprintf(
"XGapFilled=%d, NoiseBetween=%d\n",
1675 DiacriticXGapFilled(small_grid, small_box,
1677 NoNoiseInBetween(small_box, best_y_overlap->
bounding_box()));
// Starting from base_box, repeatedly widens occupied_box with neighbours
// found between it and diacritic_box, for as long as the x gap from
// diacritic_box to occupied_box exceeds max_gap.
// NOTE(review): the definition of max_gap, the construction of search_box and
// the returned values are not visible in this excerpt.
1696 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1697 const TBOX& diacritic_box,
1698 const TBOX& base_box) {
1702 TBOX occupied_box(base_box);
1704 while ((diacritic_gap = diacritic_box.
x_gap(occupied_box)) > max_gap) {
1705 TBOX search_box(occupied_box);
// True when the diacritic lies to the right of the occupied box.
1706 if (diacritic_box.
left() > search_box.
right()) {
1716 rsearch.StartRectSearch(search_box);
1718 while ((neighbour = rsearch.NextRectSearch()) !=
nullptr) {
// A neighbour closer to the diacritic than the current gap extends
// occupied_box.
1720 if (nbox.
x_gap(diacritic_box) < diacritic_gap) {
1721 if (nbox.
left() < occupied_box.left())
1723 if (nbox.
right() > occupied_box.right())
1724 occupied_box.set_right(nbox.
right());
// neighbour is null here when the rect search ran to completion.
1728 if (neighbour ==
nullptr)
// Iterates the blobs behind small_it and, for an unowned blob with an
// associated partition that is not block-owned, removes that partition from
// part_grid and later re-inserts it with InsertBBox(true, true, part).
// NOTE(review): how part is obtained, the rest of the condition and the
// change made to part between removal and re-insertion are not visible.
1735 void StrokeWidth::MergeDiacritics(
TO_BLOCK* block,
1736 ColPartitionGrid* part_grid) {
1738 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1744 if (part !=
nullptr && !part->block_owned() && blob->
owner() ==
nullptr &&
1748 part_grid->RemoveBBox(part);
1753 part_grid->InsertBBox(
true,
true, part);
// Iterates the blobs behind large_it and handles those that have no owning
// partition (owner() == nullptr).
// NOTE(review): what is done with such blobs, and how part_grid and big_parts
// are used, is not visible in this excerpt.
1764 void StrokeWidth::RemoveLargeUnusedBlobs(
TO_BLOCK* block,
1765 ColPartitionGrid* part_grid,
1766 ColPartition_LIST* big_parts) {
1768 for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1770 ColPartition* big_part = blob->
owner();
1771 if (big_part ==
nullptr) {
// Full-searches the grid and groups unowned blobs by grid cell: blobs are
// collected in cell_list, and MakePartitionsFromCellList is called each time
// the search moves to a different cell, plus once more after the loop.
// cell_all_noise starts true for every cell and is cleared by the visible
// assignments below.
// NOTE(review): the conditions that clear cell_all_noise are not visible in
// this excerpt.
1781 void StrokeWidth::PartitionRemainingBlobs(
PageSegMode pageseg_mode,
1782 ColPartitionGrid* part_grid) {
1785 int prev_grid_x = -1;
1786 int prev_grid_y = -1;
1787 BLOBNBOX_CLIST cell_list;
1788 BLOBNBOX_C_IT cell_it(&cell_list);
1789 bool cell_all_noise =
true;
1790 gsearch.StartFullSearch();
1791 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
1792 int grid_x = gsearch.GridX();
1793 int grid_y = gsearch.GridY();
// New grid cell: flush the blobs gathered for the previous cell and reset
// the per-cell state.
1794 if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1796 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1798 cell_it.set_to_list(&cell_list);
1799 prev_grid_x = grid_x;
1800 prev_grid_y = grid_y;
1801 cell_all_noise =
true;
// Only blobs that do not yet belong to a partition are collected.
1803 if (bbox->
owner() ==
nullptr) {
1804 cell_it.add_to_end(bbox);
1806 cell_all_noise =
false;
1808 cell_all_noise =
false;
// Flush the final cell.
1811 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
// Turns the blobs in cell_list into partitions, emptying the list as it goes.
// Two paths are visible: one extracts a first blob and adds every remaining
// blob to the same part before completing it; the other extracts blobs one at
// a time and completes a partition inside the loop.
// NOTE(review): the condition choosing between the paths and the creation of
// part are not visible in this excerpt.
1817 void StrokeWidth::MakePartitionsFromCellList(
PageSegMode pageseg_mode,
1819 ColPartitionGrid* part_grid,
1820 BLOBNBOX_CLIST* cell_list) {
1821 if (cell_list->empty())
1823 BLOBNBOX_C_IT cell_it(cell_list);
1825 BLOBNBOX* bbox = cell_it.extract();
// Add all remaining blobs of the cell to the same partition.
1829 for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1830 part->AddBox(cell_it.extract());
1832 CompletePartition(pageseg_mode, part, part_grid);
// Alternative path: handle the blobs individually.
1834 for (; !cell_it.empty(); cell_it.forward()) {
1835 BLOBNBOX* bbox = cell_it.extract();
1839 CompletePartition(pageseg_mode, part, part_grid);
// Finalises part: computes its limits, derives a projection value, overrides
// that value when it contradicts a forced direction in pageseg_mode, sets the
// region/flow types from it, and inserts part into part_grid.
// NOTE(review): how value is computed is not visible in this excerpt.
1846 void StrokeWidth::CompletePartition(
PageSegMode pageseg_mode,
1848 ColPartitionGrid* part_grid) {
1849 part->ComputeLimits();
// Positive value while only vertical text is wanted: becomes 0 for a
// single-box partition, otherwise -2.
1855 if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1856 value = part->boxes_count() == 1 ? 0 : -2;
1857 }
// Negative value while only horizontal text is wanted: becomes 0 for a
// single-box partition, otherwise 2.
else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1858 value = part->boxes_count() == 1 ? 0 : 2;
1860 part->SetRegionAndFlowTypesFromProjectionValue(value);
1862 part_grid->InsertBBox(
true,
true, part);
// Builds two callbacks bound to this object, OrientationSearchBox and
// ConfirmEasyMerge, each taking two placeholder arguments.
// NOTE(review): the call on part_grid that receives them is not visible in
// this excerpt.
1867 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
// Brings _1 and _2 into scope for the std::bind calls below.
1868 using namespace std::placeholders;
1870 std::bind(&StrokeWidth::OrientationSearchBox,
this, _1, _2),
1871 std::bind(&StrokeWidth::ConfirmEasyMerge,
this, _1, _2));
// Branches on whether part is of vertical type; box is an in/out pointer.
// NOTE(review): how box is adjusted and what is returned are not visible in
// this excerpt.
1877 bool StrokeWidth::OrientationSearchBox(ColPartition* part,
TBOX* box) {
1878 if (part->IsVerticalType()) {
// Applies a series of tests to the pair (p1, p2); when none of them ends the
// function early, the result is NoNoiseInBetween on the two bounding boxes.
// NOTE(review): the statements executed when a test matches are not visible
// in this excerpt.
1889 bool StrokeWidth::ConfirmEasyMerge(
const ColPartition* p1,
1890 const ColPartition* p2) {
// Vertical test: at least one partition is vertical, they have no horizontal
// core overlap, and either neither is a singleton or their boxes lack a
// major overlap.
1896 if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1897 p1->HCoreOverlap(*p2) <= 0 &&
1898 ((!p1->IsSingleton() &&
1899 !p2->IsSingleton()) ||
1900 !p1->bounding_box().major_overlap(p2->bounding_box())))
// Horizontal test: as above with vertical core overlap, and the
// no-major-overlap alternative also requires that OKDiacriticMerge fails in
// both directions.
1902 if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1903 p1->VCoreOverlap(*p2) <= 0 &&
1904 ((!p1->IsSingleton() &&
1905 !p2->IsSingleton()) ||
1906 (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1907 !p1->OKDiacriticMerge(*p2,
false) &&
1908 !p2->OKDiacriticMerge(*p1,
false))))
1910 if (!p1->ConfirmNoTabViolation(*p2))
1914 return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1918 bool StrokeWidth::NoNoiseInBetween(
const TBOX& box1,
const TBOX& box2)
const {
// In builds without GRAPHICS_DISABLED, full-searches the grid and draws a
// rectangle for each blob's box into window, with a branch on a per-blob
// goodness value.
// NOTE(review): window creation, the colour selection and the returned value
// are not visible in this excerpt.
1926 ScrollView* StrokeWidth::DisplayGoodBlobs(
const char* window_name,
1929 #ifndef GRAPHICS_DISABLED
1936 gsearch.StartFullSearch();
1938 while ((bbox = gsearch.NextFullSearch()) !=
nullptr) {
1940 int left_x = box.
left();
1941 int right_x = box.
right();
1942 int top_y = box.
top();
1943 int bottom_y = box.
bottom();
1954 else if (goodness == 1)
1960 window->
Rectangle(left_x, bottom_y, right_x, top_y);
1968 #ifndef GRAPHICS_DISABLED
1970 int top = std::max(static_cast<int>(blob_box.
top()), blob->
base_char_top());
1972 int x = (blob_box.
left() + blob_box.
right()) / 2;
1973 window->
Line(x, top, x, bottom);
1974 #endif // GRAPHICS_DISABLED
// In builds without GRAPHICS_DISABLED, iterates block->blobs and then a
// second list, calling DrawDiacriticJoiner(blob, window) inside each loop.
// NOTE(review): the second list, any per-blob condition and the returned
// value are not visible in this excerpt.
1978 ScrollView* StrokeWidth::DisplayDiacritics(
const char* window_name,
1981 #ifndef GRAPHICS_DISABLED
1986 BLOBNBOX_IT it(&block->
blobs);
1987 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1991 DrawDiacriticJoiner(blob, window);
1999 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2003 DrawDiacriticJoiner(blob, window);