37 #include "config_auto.h"
// File-local boolean parameter declared via BOOL_VAR (defined outside this
// excerpt); the second argument (true) is presumably the default value.
// It is read by the skew computation later in this file, where it gates a loop
// that stores a row's line_m() gradient repeatedly, with the repeat count
// derived from the row's blob count divided by its (ceil'd) line error.
56 static BOOL_VAR(textord_biased_skewcalc,
true,
"Bias skew estimates with line length");
// File-local boolean parameter declared via BOOL_VAR (defined outside this
// excerpt); the second argument (true) is presumably the default value.
// It is read in the blob-to-row assignment loop as one of the conditions
// (alongside last_x - left_x > block->line_size * 2) guarding a computation
// that divides by (last_x - left_x).
57 static BOOL_VAR(textord_interpolating_skew,
true,
"Interpolate across gaps");
// File-local integer parameter declared via INT_VAR (defined outside this
// excerpt); the second argument (4) is presumably the default value.
// It is referenced in the row-assignment code, in the branch that inserts a
// newly created row with add_before_then_move/add_after_then_move.
// NOTE(review): the full expression using it is not visible here; per the
// description string it feeds the skew smoothing factor - confirm.
58 static INT_VAR(textord_skewsmooth_offset, 4,
"For smooth factor");
// File-local integer parameter declared via INT_VAR (defined outside this
// excerpt); the second argument (1) is presumably the default value.
// It is referenced in the row-assignment code, in the
// "else if (make_new_rows && top - bottom < block->max_blob_size)" branch,
// right after the new row is added with add_after_then_move.
// NOTE(review): the full expression using it is not visible here - confirm.
59 static INT_VAR(textord_skewsmooth_offset2, 1,
"For smooth factor");
// File-local integer parameter declared via INT_VAR (defined outside this
// excerpt); the second argument (4) is presumably the default value.
// It is used in a condition together with the result of CountOverlaps() on a
// blob's bounding box; when that condition holds the blob is extracted from
// its row and appended via large_it.
// NOTE(review): the comparison operator is not visible in this excerpt.
65 static INT_VAR(textord_max_blob_overlaps, 4,
66 "Max number of blobs a big blob can overlap");
69 "Fraction of line spacing for quad");
71 "Fraction of line spacing for outlier");
// File-local floating-point parameter declared via double_VAR (defined
// outside this excerpt); the second argument (1.0) is presumably the default.
// NOTE(review): no use of this parameter is visible in this excerpt; the
// description string says it is applied in expand_rows - confirm there.
77 static double_VAR(textord_expansion_factor, 1.0,
78 "Factor to expand rows by in expand_rows");
// File-local floating-point parameter declared via double_VAR (defined
// outside this excerpt); the second argument (0.375) is presumably the default.
// It is used as a multiplier on a row-size quantity: in the overlap test
// "top - bottom - bestover > rowsize * textord_overlap_x" and in a
// "(... block->line_size) * (textord_overlap_x + ..." distance limit in the
// row-assignment code.
79 static double_VAR(textord_overlap_x, 0.375,
"Fraction of linespace for good overlap");
83 "New row made if blob makes row this big");
87 "Min blob height/top to include blob top into xheight stats");
89 "Min pile height to make xheight");
91 "Min pile height to make ascheight");
// File-local floating-point parameter declared via double_VAR (defined
// outside this excerpt); the second argument (0.08) is presumably the default.
// NOTE(review): no use of this parameter is visible in this excerpt; going by
// the description string it is a minimum pile-height fraction for accepting a
// descender-height mode - confirm against the descdrop computation.
92 static double_VAR(textord_descheight_mode_fraction, 0.08,
93 "Min pile height to make descheight");
// Compile-time constant 12.
// NOTE(review): its use is not visible in this excerpt; presumably it bounds
// the number of height modes collected for the xheight estimate (the modes[]
// array walked by the mode loops further down) - confirm.
103 #define MAX_HEIGHT_MODES 12
109 static float MakeRowFromBlobs(
float line_size,
110 BLOBNBOX_IT* blob_it, TO_ROW_IT* row_it) {
112 blob_it->move_to_first();
114 float total_size = 0.0f;
117 for (; !blob_it->empty(); blob_it->forward()) {
118 BLOBNBOX* blob = blob_it->extract();
121 if (row ==
nullptr) {
122 row =
new TO_ROW(blob, top, bottom, line_size);
123 row_it->add_before_then_move(row);
125 row->
add_blob(blob, top, bottom, line_size);
127 total_size += top - bottom;
130 return blob_count > 0 ? total_size / blob_count : total_size;
139 C_OUTLINE_IT ol_it(blob->
out_list());
141 ol_it.set_to_list(ol_it.data()->child());
144 for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
150 bb_it.add_after_then_move(bbox);
153 return MakeRowFromBlobs(block->
line_size, &bb_it, row_it);
164 TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
165 BLOBNBOX_IT blob_it = &block->
blobs;
166 TO_ROW_IT row_it = block->
get_rows();
172 if (block->
blobs.singleton() && allow_sub_blobs) {
173 blob_it.move_to_first();
174 float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
177 }
else if (block->
blobs.empty()) {
182 blob_it.add_after_then_move(bblob);
184 MakeRowFromBlobs(block->
line_size, &blob_it, &row_it);
186 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
203 TO_BLOCK_IT block_it;
205 block_it.set_to_list(port_blocks);
206 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
212 block_it.set_to_list(port_blocks);
213 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
215 block_it.data()->block->pdblk.bounding_box().left(),
232 TO_ROW_IT row_it = block->
get_rows ();
234 #ifndef GRAPHICS_DISABLED
244 row_it.move_to_first ();
245 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
247 #ifndef GRAPHICS_DISABLED
250 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
252 colour = static_cast<ScrollView::Color>(colour + 1);
271 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
272 const TBOX& box = blob_it.data()->bounding_box();
275 double error = lms.
Fit(&m, &c);
287 TO_BLOCK_LIST *blocks,
296 TO_BLOCK_IT block_it = blocks;
300 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
301 block_it.forward ()) {
302 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
303 if (pb !=
nullptr && !pb->
IsText())
305 row_count += block_it.data ()->get_rows ()->length ();
307 TO_ROW_IT row_it(block_it.data()->get_rows());
308 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
309 blob_count += row_it.data ()->blob_list ()->length ();
311 if (row_count == 0) {
317 std::vector<float> gradients(blob_count);
319 std::vector<float> errors(blob_count);
322 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
323 block_it.forward ()) {
324 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
325 if (pb !=
nullptr && !pb->
IsText())
327 TO_ROW_IT row_it(block_it.data ()->get_rows());
328 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
329 row = row_it.data ();
330 blob_count = row->
blob_list ()->length ();
331 row_err = static_cast<int32_t>(ceil (row->
line_error ()));
334 if (textord_biased_skewcalc) {
335 blob_count /= row_err;
336 for (blob_count /= row_err; blob_count > 0; blob_count--) {
337 gradients[row_index] = row->
line_m ();
344 gradients[row_index] = row->
line_m ();
350 if (row_index == 0) {
352 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
353 block_it.forward ()) {
354 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
355 if (pb !=
nullptr && !pb->
IsText())
357 TO_ROW_IT row_it(block_it.data()->get_rows());
358 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
360 row = row_it.data ();
361 gradients[row_index] = row->
line_m ();
367 row_count = row_index;
369 &gradients[0], row_count);
370 page_m = gradients[row_index];
372 &errors[0], row_count);
373 page_err = errors[row_index];
388 int overlap = std::min(dotbox.
right(), ibox.
right()) -
389 std::max(dotbox.
left(), ibox.
left());
391 (overlap * 2 < ibox.
width() && overlap < dotbox.
width()))
401 const double kHeightFraction = 0.6;
402 double target_height = std::min(dotbox.
bottom(), ibox.
top());
404 target_height *= kHeightFraction;
405 int left_min = dotbox.
left() - dotbox.
width();
406 int middle = (dotbox.
left() + dotbox.
right())/2;
407 int right_max = dotbox.
right() + dotbox.
width();
412 bool found_left =
false;
413 bool found_right =
false;
414 bool in_left =
false;
415 bool in_right =
false;
417 C_OUTLINE_IT o_it = blob->
out_list();
418 for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
422 for (
int step = 0; step < length; pos += outline->
step(step++)) {
425 if (x >= left_min && x < middle && !found_left) {
428 if (y > left_maxy) left_maxy = y;
429 if (y < left_miny) left_miny = y;
431 left_maxy = left_miny = y;
434 }
else if (in_left) {
436 if (left_maxy - left_miny > target_height) {
443 if (x <= right_max && x > middle && !found_right) {
446 if (y > right_maxy) right_maxy = y;
447 if (y < right_miny) right_miny = y;
449 right_maxy = right_miny = y;
452 }
else if (in_right) {
454 if (right_maxy - right_miny > target_height) {
467 TO_ROW_IT row_it = block->
get_rows ();
468 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
469 TO_ROW* row = row_it.data();
473 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
478 STATS hstats(0, max_height + 1);
479 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
485 float xheight = hstats.
median();
488 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
493 if (prev !=
nullptr) {
494 if (dot_of_i(blob, prev, row))
497 if (!b_it.at_last()) {
498 BLOBNBOX* next = b_it.data_relative(1);
499 if (dot_of_i(blob, next, row))
503 delete blob->
cblob();
504 delete b_it.extract();
526 BLOBNBOX_IT blob_it = &block->
blobs;
527 TO_ROW_IT row_it = block->
get_rows ();
529 #ifndef GRAPHICS_DISABLED
546 expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
547 blob_it.set_to_list (&block->
blobs);
548 row_it.set_to_list (block->
get_rows ());
549 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
550 blob_it.add_list_after (row_it.data ()->blob_list ());
554 blob_it.set_to_list (&block->
blobs);
558 blob_it.set_to_list (&block->
blobs);
586 TO_ROW_IT row_it = block->
get_rows ();
587 BLOBNBOX_IT blob_it = &block->
blobs;
589 if (row_it.length () == 0)
594 min_y = block_box.
bottom () - 1;
595 max_y = block_box.
top () + 1;
596 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
597 line_index = static_cast<int32_t>(floor (row_it.data ()->intercept ()));
598 if (line_index <= min_y)
599 min_y = line_index - 1;
600 if (line_index >= max_y)
601 max_y = line_index + 1;
603 line_count = max_y - min_y + 1;
607 std::vector<int32_t> deltas(line_count);
609 std::vector<int32_t> occupation(line_count);
618 max_y - min_y + 1, &occupation[0], &deltas[0]);
619 #ifndef GRAPHICS_DISABLED
621 draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
625 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
626 row = row_it.data ();
627 line_index = static_cast<int32_t>(floor (row->
intercept ()));
628 distance = deltas[line_index - min_y];
630 line_index, &row_it, testing_on)) {
631 #ifndef GRAPHICS_DISABLED
636 blob_it.add_list_after (row_it.data ()->blob_list ());
637 delete row_it.extract ();
640 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
641 blob_it.add_list_after (row_it.data ()->blob_list ());
667 tprintf (
"Row at %g(%g), dropout dist=%d,",
677 if (abs_dist > dist_limit) {
679 tprintf (
" too far - deleting\n");
683 if ((distance < 0 && !row_it->at_last ())
684 || (
distance >= 0 && !row_it->at_first ())) {
685 row_offset = row_inc;
687 next_row = row_it->data_relative (row_offset);
688 next_index = static_cast<int32_t>(floor (next_row->
intercept ()));
690 && next_index < line_index
693 && next_index > line_index
696 tprintf (
" nearer neighbour (%d) at %g\n",
702 else if (next_index == line_index
706 tprintf (
" equal but more believable at %g (%g/%g)\n",
714 row_offset += row_inc;
716 while ((next_index == line_index
718 && row_offset < row_it->length ());
740 TO_ROW_IT row_it = block->
get_rows ();
745 length = sqrt (gradient * gradient + 1);
746 rotation =
FCOORD (1 / length, -gradient / length);
747 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
748 row = row_it.data ();
750 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
751 blob_it.forward ()) {
752 blob = blob_it.data ();
754 blob_box.
rotate (rotation);
780 TO_ROW_IT row_it = block->
get_rows ();
787 line_count = max_y - min_y + 1;
788 length = sqrt (gradient * gradient + 1);
789 rotation =
FCOORD (1 / length, -gradient / length);
790 for (line_index = 0; line_index < line_count; line_index++)
791 deltas[line_index] = 0;
792 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
793 row = row_it.data ();
795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
796 blob_it.forward ()) {
797 blob = blob_it.data ();
799 blob_box.
rotate (rotation);
800 int32_t width = blob_box.
right() - blob_box.
left();
801 index = blob_box.
bottom() - min_y;
804 deltas[index] += width;
805 index = blob_box.
top() - min_y;
807 deltas[index] -= width;
810 occupation[0] = deltas[0];
811 for (line_index = 1; line_index < line_count; line_index++)
812 occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
839 if (low_window + high_window < line_count) {
840 for (sum = 0, high_index = 0; high_index < low_window; high_index++)
841 sum += occupation[high_index];
842 for (low_index = 0; low_index < high_window; low_index++, high_index++)
843 sum += occupation[high_index];
844 min_occ = occupation[0];
846 for (test_index = 1; test_index < high_index; test_index++) {
847 if (occupation[test_index] <= min_occ) {
848 min_occ = occupation[test_index];
849 min_index = test_index;
852 for (line_index = 0; line_index < low_window; line_index++)
853 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
855 for (low_index = 0; high_index < line_count; low_index++, high_index++) {
856 sum -= occupation[low_index];
857 sum += occupation[high_index];
858 if (occupation[high_index] <= min_occ) {
860 min_occ = occupation[high_index];
861 min_index = high_index;
864 if (min_index <= low_index) {
865 min_occ = occupation[low_index + 1];
866 min_index = low_index + 1;
867 for (test_index = low_index + 2; test_index <= high_index;
869 if (occupation[test_index] <= min_occ) {
870 min_occ = occupation[test_index];
872 min_index = test_index;
876 thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
880 min_occ = occupation[0];
882 for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
883 if (occupation[low_index] < min_occ) {
884 min_occ = occupation[low_index];
885 min_index = low_index;
887 sum += occupation[low_index];
891 for (; line_index < line_count; line_index++)
892 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
911 int32_t prev_threshold;
918 prev_threshold = thresholds[line_index];
923 while (line_index < line_count
924 && (occupation[line_index] < thresholds[line_index]
925 || occupation[line_index - 1] >= prev_threshold));
926 if (line_index < line_count) {
927 back_index = line_index - 1;
929 while (next_dist < -distance && back_index >= 0) {
930 thresholds[back_index] = next_dist;
938 while (line_index < line_count);
959 float y_bottom, y_top;
963 BLOBNBOX_IT blob_it = &block->
blobs;
964 TO_ROW_IT row_it = block->
get_rows ();
966 #ifndef GRAPHICS_DISABLED
975 if (block->
get_rows ()->length () == 0)
981 if (block->
get_rows ()->length () == 0)
990 row_it.move_to_last ();
992 row = row_it.data ();
993 y_max = row->
max_y ();
994 y_min = row->
min_y ();
1000 if (y_min > y_bottom) {
1002 tprintf(
"Expanding bottom of row at %f from %f to %f\n",
1005 swallowed_row =
true;
1006 while (swallowed_row && !row_it.at_last ()) {
1007 swallowed_row =
false;
1009 test_row = row_it.data_relative (1);
1011 if (test_row->
max_y () > y_bottom) {
1012 if (test_row->
min_y () > y_bottom) {
1016 #ifndef GRAPHICS_DISABLED
1024 blob_it.set_to_list (row->
blob_list ());
1025 blob_it.add_list_after (test_row->
blob_list ());
1027 delete row_it.extract ();
1029 swallowed_row =
true;
1031 else if (test_row->
max_y () < y_min) {
1033 y_bottom = test_row->
max_y ();
1035 tprintf(
"Truncating limit to %f due to touching row at %f\n",
1041 tprintf(
"Not expanding limit beyond %f due to touching row at %f\n",
1048 if (y_max < y_top) {
1050 tprintf(
"Expanding top of row at %f from %f to %f\n",
1052 swallowed_row =
true;
1053 while (swallowed_row && !row_it.at_first ()) {
1054 swallowed_row =
false;
1056 test_row = row_it.data_relative (-1);
1057 if (test_row->
min_y () < y_top) {
1058 if (test_row->
max_y () < y_top) {
1062 blob_it.set_to_list (row->
blob_list ());
1063 #ifndef GRAPHICS_DISABLED
1071 blob_it.add_list_after (test_row->
blob_list ());
1073 delete row_it.extract ();
1075 swallowed_row =
true;
1077 else if (test_row->
min_y () < y_max) {
1079 y_top = test_row->
min_y ();
1081 tprintf(
"Truncating limit to %f due to touching row at %f\n",
1087 tprintf(
"Not expanding limit beyond %f due to touching row at %f\n",
1098 while (!row_it.at_last ());
1114 TO_ROW_IT row_it = block->
get_rows ();
1117 tprintf(
"Adjusting row limits for block(%d,%d)\n",
1120 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1121 row = row_it.data ();
1124 tprintf(
"Row at %f has min %f, max %f, size %f\n",
1151 TO_ROW_IT row_it = block->
get_rows ();
1153 int16_t rowcount = row_it.length ();
1155 std::vector<TO_ROW*> rows(rowcount);
1158 row_it.move_to_last ();
1160 row = row_it.data ();
1161 if (prev_row !=
nullptr) {
1162 rows[rowcount++] = prev_row;
1165 tprintf (
"Row at %g yields spacing of %g\n",
1171 while (!row_it.at_last ());
1176 tprintf (
"Blob based spacing=(%g,%g), offset=%g",
1181 iqr = rows[row_index]->spacing;
1184 iqr -= rows[row_index]->spacing;
1187 block->
key_row = rows[row_index];
1189 tprintf (
" row based=%g(%g)", rows[row_index]->spacing, iqr);
1193 if (rows[row_index]->spacing < block->line_spacing
1194 && rows[row_index]->spacing > block->
line_size)
1196 block->
line_size = rows[row_index]->spacing;
1198 else if (rows[row_index]->spacing > block->
line_spacing)
1203 if (rows[row_index]->spacing < block->line_spacing)
1204 block->
line_size = rows[row_index]->spacing;
1219 tprintf (
"\nEstimate line size=%g, spacing=%g, offset=%g\n",
1260 int32_t min_height, max_height;
1261 TO_ROW_IT row_it = block->
get_rows();
1262 if (row_it.empty())
return;
1267 STATS row_asc_xheights(min_height, max_height + 1);
1268 STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
1269 static_cast<int>(max_height * asc_frac_xheight) + 1);
1270 int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
1271 int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
1272 STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
1273 STATS row_desc_xheights(min_height, max_height + 1);
1274 STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
1275 STATS row_cap_xheights(min_height, max_height + 1);
1276 STATS row_cap_floating_xheights(min_height, max_height + 1);
1277 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1278 row = row_it.data();
1286 row_asc_xheights.
add(static_cast<int32_t>(row->
xheight),
1288 row_asc_ascrise.
add(static_cast<int32_t>(row->
ascrise),
1290 row_asc_descdrop.
add(static_cast<int32_t>(-row->
descdrop),
1293 row_desc_xheights.
add(static_cast<int32_t>(row->
xheight),
1295 row_desc_descdrop.
add(static_cast<int32_t>(-row->
descdrop),
1299 &row_cap_xheights, &row_cap_floating_xheights);
1303 float xheight = 0.0;
1304 float ascrise = 0.0;
1305 float descdrop = 0.0;
1309 xheight = row_asc_xheights.
median();
1310 ascrise = row_asc_ascrise.
median();
1311 descdrop = -row_asc_descdrop.
median();
1312 }
else if (row_desc_xheights.
get_total() > 0) {
1314 xheight = row_desc_xheights.
median();
1315 descdrop = -row_desc_descdrop.
median();
1316 }
else if (row_cap_xheights.
get_total() > 0) {
1327 min_height, max_height, &(xheight), &(ascrise));
1335 bool corrected_xheight =
false;
1338 corrected_xheight =
true;
1340 if (corrected_xheight || ascrise <= 0.0) {
1341 ascrise = xheight * asc_frac_xheight;
1343 if (corrected_xheight || descdrop >= 0.0) {
1344 descdrop = -(xheight * desc_frac_xheight);
1349 tprintf(
"Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
1350 xheight, ascrise, descdrop);
1353 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1369 int block_line_size) {
1377 int min_height, max_height;
1379 STATS heights(min_height, max_height + 1);
1380 STATS floating_heights(min_height, max_height + 1);
1382 &heights, &floating_heights);
1388 rotation.
y() == 0.0,
1389 min_height, max_height,
1393 row->
descdrop = static_cast<float>(
1407 int max_height,
STATS *heights,
STATS *floating_heights) {
1414 if (blob_it.empty())
return;
1415 bool has_rep_chars =
1418 blob = blob_it.data();
1427 top -= gradient * xcentre + row->
parallel_c();
1428 if (top >= min_height && top <= max_height) {
1429 heights->
add(static_cast<int32_t>(floor(top + 0.5)), 1);
1431 floating_heights->
add(static_cast<int32_t>(floor(top + 0.5)), 1);
1439 while (!blob_it.at_first() &&
1440 blob_it.data()->repeated_set() == repeated_set) {
1443 tprintf(
"Skipping repeated char when computing xheight\n");
1448 }
while (!blob_it.at_first());
1468 STATS *heights,
STATS *floating_heights,
bool cap_only,
int min_height,
1469 int max_height,
float *xheight,
float *ascrise) {
1470 int blob_index = heights->
mode();
1471 int blob_count = heights->
pile_count(blob_index);
1473 tprintf(
"min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
1474 min_height, max_height, blob_index, blob_count,
1477 floating_heights->
print();
1479 if (blob_count == 0)
return 0;
1481 bool in_best_pile =
false;
1482 int prev_size = -INT32_MAX;
1486 if (cap_only && mode_count > 1)
1490 tprintf(
"found %d modes: ", mode_count);
1491 for (x = 0; x < mode_count; x++)
tprintf(
"%d ", modes[x]);
1495 for (x = 0; x < mode_count - 1; x++) {
1496 if (modes[x] != prev_size + 1)
1497 in_best_pile =
false;
1498 int modes_x_count = heights->
pile_count(modes[x]) -
1501 (in_best_pile || modes_x_count > best_count)) {
1502 for (
int asc = x + 1; asc < mode_count; asc++) {
1504 static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
1509 if (modes_x_count > best_count) {
1510 in_best_pile =
true;
1511 best_count = modes_x_count;
1514 tprintf(
"X=%d, asc=%d, count=%d, ratio=%g\n",
1515 modes[x], modes[asc]-modes[x], modes_x_count, ratio);
1517 prev_size = modes[x];
1518 *xheight = static_cast<float>(modes[x]);
1519 *ascrise = static_cast<float>(modes[asc] - modes[x]);
1524 if (*xheight == 0) {
1531 if (floating_heights->
get_total() > 0) {
1532 for (x = min_height; x < max_height; ++x) {
1535 blob_index = heights->
mode();
1536 for (x = min_height; x < max_height; ++x) {
1540 *xheight = static_cast<float>(blob_index);
1542 best_count = heights->
pile_count(blob_index);
1544 tprintf(
"Single mode xheight set to %g\n", *xheight);
1546 tprintf(
"Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1564 int xheight_blob_count,
STATS *asc_heights) {
1568 i_min = static_cast<int>(
1575 int num_potential_asc = 0;
1576 for (
int i = i_min; i <= i_max; ++i) {
1577 num_potential_asc += asc_heights->
pile_count(i);
1587 STATS heights (min_height, max_height + 1);
1588 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1589 blob = blob_it.data();
1593 height = (gradient * xcentre + row->
parallel_c() -
1595 if (height >= min_height && height <= max_height)
1596 heights.
add(static_cast<int>(floor(height + 0.5)), 1);
1599 int blob_index = heights.
mode();
1600 int blob_count = heights.
pile_count(blob_index);
1601 float total_fraction =
1603 if (static_cast<float>(blob_count + num_potential_asc) <
1604 xheight_blob_count * total_fraction) {
1607 int descdrop = blob_count > 0 ? -blob_index : 0;
1609 tprintf(
"Descdrop: %d (potential ascenders %d, descenders %d)\n",
1610 descdrop, num_potential_asc, blob_count);
1631 int32_t least_count;
1632 int32_t least_index;
1635 src_count = max_height + 1 - min_height;
1637 least_count = INT32_MAX;
1639 for (src_index = 0; src_index < src_count; src_index++) {
1640 pile_count = heights->
pile_count(min_height + src_index);
1641 if (pile_count > 0) {
1642 if (dest_count < maxmodes) {
1643 if (pile_count < least_count) {
1645 least_count = pile_count;
1646 least_index = dest_count;
1648 modes[dest_count++] = min_height + src_index;
1649 }
else if (pile_count >= least_count) {
1650 while (least_index < maxmodes - 1) {
1651 modes[least_index] = modes[least_index + 1];
1656 modes[maxmodes - 1] = min_height + src_index;
1657 if (pile_count == least_count) {
1659 least_index = maxmodes - 1;
1663 for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1664 pile_count = heights->
pile_count(modes[dest_count]);
1665 if (pile_count < least_count) {
1667 least_count = pile_count;
1668 least_index = dest_count;
1686 float ascrise,
float descdrop) {
1689 tprintf(
"correcting row xheight: row->xheight %.4f"
1690 ", row->acrise %.4f row->descdrop %.4f\n",
1693 bool normal_xheight =
1710 (normal_xheight || cap_xheight)) ||
1711 (row_category ==
ROW_UNKNOWN && normal_xheight)) {
1735 if (row->
xheight < xheight + ascrise && row->
xheight > xheight) {
1738 tprintf(
"all caps with irregular xheight\n");
1747 tprintf(
"corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1752 static int CountOverlaps(
const TBOX& box,
int min_height,
1753 BLOBNBOX_LIST* blobs) {
1755 BLOBNBOX_IT blob_it(blobs);
1756 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1783 BLOBNBOX_IT blob_it;
1787 TO_ROW_IT row_it = block->
get_rows();
1792 length = sqrt(1 + gradient * gradient);
1793 g_vec =
FCOORD(1 / length, -gradient / length);
1794 blob_rotation =
FCOORD(rotation.
x(), -rotation.
y());
1795 blob_rotation.
rotate(g_vec);
1796 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1797 row = row_it.data();
1800 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
1801 blob_it.forward()) {
1802 blob = blob_it.data();
1810 rotated_blob, static_cast<int16_t>(row->
intercept()),
1811 static_cast<int16_t>(
1815 under_it.add_after_then_move(blob_it.extract());
1817 tprintf(
"Underlined blob at:");
1818 rotated_blob->bounding_box().print();
1822 }
else if (CountOverlaps(blob->
bounding_box(), min_blob_height,
1824 textord_max_blob_overlaps) {
1825 large_it.add_after_then_move(blob_it.extract());
1827 tprintf(
"Large blob overlaps %d blobs at:",
1828 CountOverlaps(blob_box, min_blob_height,
1833 delete rotated_blob;
1851 #ifndef GRAPHICS_DISABLED
1858 BLOBNBOX_IT blob_it;
1859 BLOBNBOX_IT start_it;
1860 TO_ROW_IT row_it = block->
get_rows ();
1862 #ifndef GRAPHICS_DISABLED
1866 blob_rotation =
FCOORD (rotation.
x (), -rotation.
y ());
1867 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1869 blob_it.set_to_list (row_it.data ()->blob_list ());
1870 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1871 blob_it.forward ()) {
1872 blob = blob_it.data ();
1885 if (!blob_it.at_last ()) {
1886 nextblob = blob_it.data_relative(1);
1889 blob->
merge(nextblob);
1896 blob->
chop (&start_it, &blob_it,
1902 #ifndef GRAPHICS_DISABLED
1907 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1908 blob_it.forward ()) {
1909 blob = blob_it.data ();
1911 blob_box.
rotate (rotation);
1914 blob_box.
right (), blob_box.
top ());
1917 colour = static_cast<ScrollView::Color>(colour + 1);
1938 #ifndef GRAPHICS_DISABLED
1941 TO_ROW_IT row_it = block->
get_rows ();
1943 row_it.move_to_first ();
1944 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1945 if (row_it.data ()->blob_list ()->empty ())
1946 delete row_it.extract ();
1950 #ifndef GRAPHICS_DISABLED
1953 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1955 block_edge, colour, rotation);
1956 colour = static_cast<ScrollView::Color>(colour + 1);
1980 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1981 if (!blob_it.data()->joined_to_prev()) {
1982 const TBOX& box = blob_it.data()->bounding_box();
1990 error = lms.
Fit(&gradient, &c);
2006 #ifndef GRAPHICS_DISABLED
2009 TO_ROW_IT row_it = block->
get_rows ();
2011 row_it.move_to_first ();
2012 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2013 if (row_it.data ()->blob_list ()->empty ())
2014 delete row_it.extract ();
2019 #ifndef GRAPHICS_DISABLED
2022 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
2023 row_it.forward ()) {
2024 row_it.data ()->baseline.plot (
to_win, colour);
2025 colour = static_cast<ScrollView::Color>(colour + 1);
2031 make_old_baselines(block, testing_on, gradient);
2033 #ifndef GRAPHICS_DISABLED
2036 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2037 row_it.data ()->baseline.plot (
to_win, colour);
2038 colour = static_cast<ScrollView::Color>(colour + 1);
2062 auto *xstarts =
new int32_t[row->
blob_list()->length() + 1];
2067 xstarts[1] = xstarts[segments];
2069 coeffs =
new double[3];
2071 coeffs[1] = row->
line_m ();
2072 coeffs[2] = row->
line_c ();
2104 BLOBNBOX_IT blob_it = row->
blob_list ();
2105 BLOBNBOX_IT new_it = blob_it;
2108 needs_curve =
false;
2110 xstarts[0] = box.
left ();
2112 blobcount = row->
blob_list ()->length ();
2114 tprintf (
"Segmenting baseline of %d blobs at (%d,%d)\n",
2118 blob_it.move_to_last ();
2119 box = blob_it.data ()->bounding_box ();
2120 xstarts[1] = box.
right ();
2124 new_it.mark_cycle_pt ();
2127 middle = (new_box.
left () + new_box.
right ()) / 2.0;
2130 yshifts.
add (yshift, blobindex);
2131 if (new_it.cycled_list ()) {
2132 xstarts[1] = new_box.
right ();
2153 xstarts[segments++] = box.
left ();
2159 middle = (new_box.
left () + new_box.
right ()) / 2.0;
2161 yshifts.
add (yshift, blobindex);
2165 while (!new_it.cycled_list ());
2167 xstarts[segments] = new_box.
right ();
2170 xstarts[--segments] = new_box.
right ();
2173 tprintf (
"Made %d segments on row at (%d,%d)\n",
2196 int blobs_per_segment;
2200 BLOBNBOX_IT blob_it = row->
blob_list ();
2201 BLOBNBOX_IT new_it = blob_it;
2207 xstarts[0] = box.
left ();
2209 while (!blob_it.at_first ()) {
2216 blobs_per_segment = blobcount / segments;
2218 auto *coeffs =
new double[segments * 3];
2221 (
"Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2222 blobcount, box.
left (), box.
bottom (), segments, blobs_per_segment);
2224 for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2229 blobindex += blobs_per_segment;
2231 while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2233 int middle = (box.
left() + box.
right()) / 2;
2236 if (index1 == blobindex - blobs_per_segment / 2
2237 || index1 == blobcount - 1) {
2238 xstarts[segment] = box.
left ();
2242 coeffs[segment * 3 - 3] = 0;
2243 coeffs[segment * 3 - 2] = b;
2244 coeffs[segment * 3 - 1] = c;
2246 if (segment > segments)
2249 blobindex += blobs_per_segment;
2251 while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2253 int middle = (new_box.
left() + new_box.
right()) / 2;
2256 if (index2 == blobindex - blobs_per_segment / 2
2257 || index2 == blobcount - 1) {
2258 xstarts[segment] = new_box.
left ();
2262 coeffs[segment * 3 - 3] = 0;
2263 coeffs[segment * 3 - 2] = b;
2264 coeffs[segment * 3 - 1] = c;
2267 while (segment <= segments);
2289 float g_length = 1.0f;
2294 float smooth_factor;
2299 TO_ROW *dest_row =
nullptr;
2301 BLOBNBOX_IT blob_it = &block->
blobs;
2302 TO_ROW_IT row_it = block->
get_rows ();
2307 if (gradient !=
nullptr)
2308 g_length = sqrt (1 + *gradient * *gradient);
2309 #ifndef GRAPHICS_DISABLED
2315 smooth_factor = 1.0;
2317 row_count = row_it.length ();
2318 if (!blob_it.empty ()) {
2319 left_x = blob_it.data ()->bounding_box ().left ();
2325 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2326 blob = blob_it.data ();
2327 if (gradient !=
nullptr) {
2332 && last_x - left_x > block->
line_size * 2
2333 && textord_interpolating_skew) {
2336 / (last_x - left_x);
2342 #ifndef GRAPHICS_DISABLED
2346 if (!row_it.empty ()) {
2347 for (row_it.move_to_first ();
2348 !row_it.at_last () && row_it.data ()->min_y () > top;
2350 row = row_it.data ();
2351 if (row->
min_y () <= top && row->
max_y () >= bottom) {
2359 if (overlap_result ==
NEW_ROW && !reject_misses)
2364 if (!make_new_rows) {
2365 near_dist = row_it.data_relative (-1)->min_y () - top;
2367 if (bottom < row->min_y ()) {
2368 if (row->
min_y () - bottom <=
2376 else if (near_dist > 0
2377 && near_dist < bottom - row->max_y ()) {
2379 dest_row = row_it.data ();
2380 if (dest_row->
min_y () - bottom <=
2388 if (top - row->
max_y () <=
2390 block->
line_size) * (textord_overlap_x +
2399 if (overlap_result ==
ASSIGN)
2400 dest_row->
add_blob (blob_it.extract (), top, bottom,
2402 if (overlap_result ==
NEW_ROW) {
2403 if (make_new_rows && top - bottom < block->max_blob_size) {
2405 new TO_ROW (blob_it.extract (), top, bottom,
2408 if (bottom > row_it.data ()->min_y ())
2409 row_it.add_before_then_move (dest_row);
2412 row_it.add_after_then_move (dest_row);
2415 textord_skewsmooth_offset);
2421 else if (make_new_rows && top - bottom < block->max_blob_size) {
2426 row_it.add_after_then_move(dest_row);
2428 textord_skewsmooth_offset2);
2433 if (overlap_result !=
REJECT) {
2434 tprintf(
"Test blob assigned to row at (%g,%g) on pass %d\n",
2438 tprintf(
"Test blob assigned to no row on pass %d\n", pass);
2441 if (overlap_result !=
REJECT) {
2442 while (!row_it.at_first() &&
2443 row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
2444 row = row_it.extract();
2446 row_it.add_before_then_move(row);
2448 while (!row_it.at_last() &&
2449 row_it.data ()->min_y() < row_it.data_relative (1)->min_y()) {
2450 row = row_it.extract();
2453 row_it.add_after_then_move(row);
2455 BLOBNBOX_IT added_blob_it(dest_row->
blob_list());
2456 added_blob_it.move_to_last();
2457 TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
2458 if (dest_row->
blob_list()->singleton() ||
2460 block_skew = (1 - smooth_factor) * block_skew
2466 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2467 if (row_it.data()->blob_list()->empty())
2468 delete row_it.extract();
2489 float merge_top, merge_bottom;
2493 BLOBNBOX_IT blob_it;
2496 row = row_it->data ();
2497 bestover = top - bottom;
2498 if (top > row->
max_y ())
2499 bestover -= top - row->
max_y ();
2500 if (bottom < row->min_y ())
2502 bestover -= row->
min_y () - bottom;
2504 tprintf(
"Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n",
2505 bottom, top, row->
min_y(), row->
max_y(), rowsize, bestover);
2509 if (!row_it->at_last ()) {
2511 test_row = row_it->data ();
2512 if (test_row->
min_y () <= top && test_row->
max_y () >= bottom) {
2514 test_row->
max_y () >
2517 test_row->
min_y () <
2519 if (merge_top - merge_bottom <= rowsize) {
2521 tprintf (
"Merging rows at (%g,%g), (%g,%g)\n",
2525 test_row->
set_limits (merge_bottom, merge_top);
2526 blob_it.set_to_list (test_row->
blob_list ());
2527 blob_it.add_list_after (row->
blob_list ());
2529 row_it->backward ();
2530 delete row_it->extract ();
2534 overlap = top - bottom;
2535 if (top > test_row->
max_y ())
2536 overlap -= top - test_row->
max_y ();
2537 if (bottom < test_row->min_y ())
2538 overlap -= test_row->
min_y () - bottom;
2539 if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2542 if (overlap > bestover) {
2547 tprintf(
"Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n",
2548 bottom, top, test_row->
min_y(), test_row->
max_y(),
2549 rowsize, overlap, bestover);
2554 while (!row_it->at_last ()
2555 && test_row->
min_y () <= top && test_row->
max_y () >= bottom);
2556 while (row_it->data () != row)
2557 row_it->backward ();
2559 if (top - bottom - bestover > rowsize * textord_overlap_x &&
2575 const void *item2) {
2577 const BLOBNBOX *blob1 = *reinterpret_cast<const BLOBNBOX* const*>(item1);
2579 const BLOBNBOX *blob2 = *reinterpret_cast<const BLOBNBOX* const*>(item2);
2597 const void *item2) {
2599 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2601 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2619 const void *item2) {
2621 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2623 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2641 int num_repeated_sets = 0;
2642 if (!box_it.empty()) {
2645 int repeat_length = 1;
2648 BLOBNBOX_IT test_it(box_it);
2649 for (test_it.forward(); !test_it.at_first();) {
2650 bblob = test_it.data();
2654 bblob = test_it.data();
2663 num_repeated_sets++;
2664 for (; repeat_length > 0; box_it.forward(), --repeat_length) {
2665 bblob = box_it.data();
2672 }
while (!box_it.at_first());