tesseract  4.0.0-1-g2a2b
topitch.cpp File Reference
#include "blobbox.h"
#include "statistc.h"
#include "drawtord.h"
#include "makerow.h"
#include "pitsync1.h"
#include "pithsync.h"
#include "tovars.h"
#include "wordseg.h"
#include "topitch.h"
#include "helpers.h"
#include <memory>

Go to the source code of this file.

Macros

#define EXTERN
 
#define FIXED_WIDTH_MULTIPLE   5
 
#define BLOCK_STATS_CLUSTERS   10
 
#define MAX_ALLOWED_PITCH   100
 

Functions

void compute_fixed_pitch (ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, bool testing_on)
 
void fix_row_pitch (TO_ROW *bad_row, TO_BLOCK *bad_block, TO_BLOCK_LIST *blocks, int32_t row_target, int32_t block_target)
 
void compute_block_pitch (TO_BLOCK *block, FCOORD rotation, int32_t block_index, bool testing_on)
 
bool compute_rows_pitch (TO_BLOCK *block, int32_t block_index, bool testing_on)
 
bool try_doc_fixed (ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient)
 
bool try_block_fixed (TO_BLOCK *block, int32_t block_index)
 
bool try_rows_fixed (TO_BLOCK *block, int32_t block_index, bool testing_on)
 
void print_block_counts (TO_BLOCK *block, int32_t block_index)
 
void count_block_votes (TO_BLOCK *block, int32_t &def_fixed, int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, int32_t &corr_prop, int32_t &dunno)
 
bool row_pitch_stats (TO_ROW *row, int32_t maxwidth, bool testing_on)
 
bool find_row_pitch (TO_ROW *row, int32_t maxwidth, int32_t dm_gap, TO_BLOCK *block, int32_t block_index, int32_t row_index, bool testing_on)
 
bool fixed_pitch_row (TO_ROW *row, BLOCK *block, int32_t block_index)
 
bool count_pitch_stats (TO_ROW *row, STATS *gap_stats, STATS *pitch_stats, float initial_pitch, float min_space, bool ignore_outsize, bool split_outsize, int32_t dm_gap)
 
float tune_row_pitch (TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
 
float tune_row_pitch2 (TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
 
float compute_pitch_sd (TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch, float &sp_sd, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
 
float compute_pitch_sd2 (TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float initial_pitch, int16_t &occupation, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
 
void print_pitch_sd (TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch)
 
void find_repeated_chars (TO_BLOCK *block, bool testing_on)
 
void plot_fp_word (TO_BLOCK *block, float pitch, float nonspace)
 

Variables

EXTERN bool textord_all_prop = FALSE
 
EXTERN bool textord_debug_pitch_test = FALSE
 
EXTERN bool textord_disable_pitch_test = FALSE
 
EXTERN bool textord_fast_pitch_test = FALSE
 
EXTERN bool textord_debug_pitch_metric = FALSE
 
EXTERN bool textord_show_row_cuts = FALSE
 
EXTERN bool textord_show_page_cuts = FALSE
 
EXTERN bool textord_pitch_cheat = FALSE
 
EXTERN bool textord_blockndoc_fixed = FALSE
 
EXTERN double textord_projection_scale = 0.200
 
EXTERN double textord_balance_factor = 1.0
 

Macro Definition Documentation

◆ BLOCK_STATS_CLUSTERS

#define BLOCK_STATS_CLUSTERS   10

Definition at line 60 of file topitch.cpp.

◆ EXTERN

#define EXTERN

Definition at line 38 of file topitch.cpp.

◆ FIXED_WIDTH_MULTIPLE

#define FIXED_WIDTH_MULTIPLE   5

Definition at line 59 of file topitch.cpp.

◆ MAX_ALLOWED_PITCH

#define MAX_ALLOWED_PITCH   100

Definition at line 61 of file topitch.cpp.

Function Documentation

◆ compute_block_pitch()

void compute_block_pitch ( TO_BLOCK *  block,
FCOORD  rotation,
int32_t  block_index,
bool  testing_on 
)

Definition at line 318 of file topitch.cpp.

321  { // correct orientation
322  TBOX block_box; //bounding box
323 
324  block_box = block->block->pdblk.bounding_box ();
325  if (testing_on && textord_debug_pitch_test) {
326  tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
327  block_index,
328  block_box.left (), block_box.bottom (),
329  block_box.right (), block_box.top ());
330  }
331  block->min_space = (int32_t) floor (block->xheight
332  * textord_words_default_minspace);
333  block->max_nonspace = (int32_t) ceil (block->xheight
334  * textord_words_default_nonspace);
335  block->fixed_pitch = 0.0f;
336  block->space_size = (float) block->min_space;
337  block->kern_size = (float) block->max_nonspace;
338  block->pr_nonsp = block->xheight * words_default_prop_nonspace;
339  block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
340  if (!block->get_rows ()->empty ()) {
341  ASSERT_HOST (block->xheight > 0);
342  find_repeated_chars(block, textord_show_initial_words && testing_on);
343 #ifndef GRAPHICS_DISABLED
344  if (textord_show_initial_words && testing_on)
345  //overlap_picture_ops(TRUE);
346  ScrollView::Update();
347 #endif
348  compute_rows_pitch(block,
349  block_index,
350  textord_debug_pitch_test && testing_on);
351  }
352 }
bool compute_rows_pitch(TO_BLOCK *block, int32_t block_index, bool testing_on)
Definition: topitch.cpp:361
float fixed_pitch
Definition: blobbox.h:802
float kern_size
Definition: blobbox.h:803
Definition: rect.h:34
EXTERN double textord_words_default_nonspace
Definition: tovars.cpp:51
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
static void Update()
Definition: scrollview.cpp:711
int32_t max_nonspace
Definition: blobbox.h:806
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:80
void find_repeated_chars(TO_BLOCK *block, bool testing_on)
Definition: topitch.cpp:1763
float space_size
Definition: blobbox.h:804
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
float xheight
Definition: blobbox.h:801
int32_t min_space
Definition: blobbox.h:805
EXTERN double words_default_prop_nonspace
Definition: tovars.cpp:72
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
float pr_nonsp
Definition: blobbox.h:810
BLOCK * block
Definition: blobbox.h:790
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
float pr_space
Definition: blobbox.h:809
EXTERN double textord_words_default_minspace
Definition: tovars.cpp:48
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
PDBLK pdblk
Definition: ocrblock.h:192
EXTERN bool textord_debug_pitch_test
Definition: topitch.cpp:42
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ compute_fixed_pitch()

void compute_fixed_pitch ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient,
FCOORD  rotation,
bool  testing_on 
)

Definition at line 84 of file topitch.cpp.

88  { // correct orientation
89  TO_BLOCK_IT block_it; //iterator
90  TO_BLOCK *block; //current block;
91  TO_ROW *row; //current row
92  int block_index; //block number
93  int row_index; //row number
94 
95 #ifndef GRAPHICS_DISABLED
96  if (textord_show_initial_words && testing_on) {
97  if (to_win == nullptr)
98  create_to_win(page_tr);
99  }
100 #endif
101 
102  block_it.set_to_list (port_blocks);
103  block_index = 1;
104  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
105  block_it.forward ()) {
106  block = block_it.data ();
107  compute_block_pitch(block, rotation, block_index, testing_on);
108  block_index++;
109  }
110 
111  if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
112  block_index = 1;
113  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
114  block_it.forward ()) {
115  block = block_it.data ();
116  if (!try_block_fixed (block, block_index))
117  try_rows_fixed(block, block_index, testing_on);
118  block_index++;
119  }
120  }
121 
122  block_index = 1;
123  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
124  block_it.forward()) {
125  block = block_it.data ();
126  POLY_BLOCK* pb = block->block->pdblk.poly_block();
127  if (pb != nullptr && !pb->IsText()) continue; // Non-text doesn't exist!
128  // row iterator
129  TO_ROW_IT row_it(block->get_rows());
130  row_index = 1;
131  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
132  row = row_it.data ();
133  fix_row_pitch(row, block, port_blocks, row_index, block_index);
134  row_index++;
135  }
136  block_index++;
137  }
138 #ifndef GRAPHICS_DISABLED
139  if (textord_show_initial_words && testing_on) {
140  ScrollView::Update();
141  }
142 #endif
143 }
bool try_block_fixed(TO_BLOCK *block, int32_t block_index)
Definition: topitch.cpp:545
bool try_rows_fixed(TO_BLOCK *block, int32_t block_index, bool testing_on)
Definition: topitch.cpp:559
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
static void Update()
Definition: scrollview.cpp:711
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
bool IsText() const
Definition: polyblk.h:49
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
BLOCK * block
Definition: blobbox.h:790
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
void fix_row_pitch(TO_ROW *bad_row, TO_BLOCK *bad_block, TO_BLOCK_LIST *blocks, int32_t row_target, int32_t block_target)
Definition: topitch.cpp:153
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:46
bool try_doc_fixed(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient)
Definition: topitch.cpp:405
PDBLK pdblk
Definition: ocrblock.h:192
void compute_block_pitch(TO_BLOCK *block, FCOORD rotation, int32_t block_index, bool testing_on)
Definition: topitch.cpp:318

◆ compute_pitch_sd()

float compute_pitch_sd ( TO_ROW *  row,
STATS *  projection,
int16_t  projection_left,
int16_t  projection_right,
float  space_size,
float  initial_pitch,
float &  sp_sd,
int16_t &  mid_cuts,
ICOORDELT_LIST *  row_cells,
bool  testing_on,
int16_t  start,
int16_t  end 
)

Definition at line 1380 of file topitch.cpp.

1393  {
1394  int16_t occupation; //no of cells in word.
1395  //blobs
1396  BLOBNBOX_IT blob_it = row->blob_list ();
1397  BLOBNBOX_IT start_it; //start of word
1398  BLOBNBOX_IT plot_it; //for plotting
1399  int16_t blob_count; //no of blobs
1400  TBOX blob_box; //bounding box
1401  TBOX prev_box; //of super blob
1402  int32_t prev_right; //of word sync
1403  int scale_factor; //on scores for big words
1404  int32_t sp_count; //spaces
1405  FPSEGPT_LIST seg_list; //char cells
1406  FPSEGPT_IT seg_it; //iterator
1407  int16_t segpos; //position of segment
1408  int16_t cellpos; //previous cell boundary
1409  //iterator
1410  ICOORDELT_IT cell_it = row_cells;
1411  ICOORDELT *cell; //new cell
1412  double sqsum; //sum of squares
1413  double spsum; //of spaces
1414  double sp_var; //space error
1415  double word_sync; //result for word
1416  int32_t total_count; //total blobs
1417 
1418  if ((pitsync_linear_version & 3) > 1) {
1419  word_sync = compute_pitch_sd2 (row, projection, projection_left,
1420  projection_right, initial_pitch,
1421  occupation, mid_cuts, row_cells,
1422  testing_on, start, end);
1423  sp_sd = occupation;
1424  return word_sync;
1425  }
1426  mid_cuts = 0;
1427  cellpos = 0;
1428  total_count = 0;
1429  sqsum = 0;
1430  sp_count = 0;
1431  spsum = 0;
1432  prev_right = -1;
1433  if (blob_it.empty ())
1434  return space_size * 10;
1435 #ifndef GRAPHICS_DISABLED
1436  if (testing_on && to_win != nullptr) {
1437  blob_box = blob_it.data ()->bounding_box ();
1438  projection->plot (to_win, projection_left,
1439  row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1440  }
1441 #endif
1442  start_it = blob_it;
1443  blob_count = 0;
1444  blob_box = box_next (&blob_it);//first blob
1445  blob_it.mark_cycle_pt ();
1446  do {
1447  for (; blob_count > 0; blob_count--)
1448  box_next(&start_it);
1449  do {
1450  prev_box = blob_box;
1451  blob_count++;
1452  blob_box = box_next (&blob_it);
1453  }
1454  while (!blob_it.cycled_list ()
1455  && blob_box.left () - prev_box.right () < space_size);
1456  plot_it = start_it;
1457  if (pitsync_linear_version & 3)
1458  word_sync =
1459  check_pitch_sync2 (&start_it, blob_count, (int16_t) initial_pitch, 2,
1460  projection, projection_left, projection_right,
1461  row->xheight * textord_projection_scale,
1462  occupation, &seg_list, start, end);
1463  else
1464  word_sync =
1465  check_pitch_sync (&start_it, blob_count, (int16_t) initial_pitch, 2,
1466  projection, &seg_list);
1467  if (testing_on) {
1468  tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
1469  prev_box.right (), prev_box.top (),
1470  seg_list.length () - 1, word_sync);
1471  seg_it.set_to_list (&seg_list);
1472  for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
1473  seg_it.forward ()) {
1474  if (seg_it.data ()->faked)
1475  tprintf ("(F)");
1476  tprintf ("%d, ", seg_it.data ()->position ());
1477  // tprintf("C=%g, s=%g, sq=%g\n",
1478  // seg_it.data()->cost_function(),
1479  // seg_it.data()->sum(),
1480  // seg_it.data()->squares());
1481  }
1482  tprintf ("\n");
1483  }
1484 #ifndef GRAPHICS_DISABLED
1485  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr)
1486  plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1487 #endif
1488  seg_it.set_to_list (&seg_list);
1489  if (prev_right >= 0) {
1490  sp_var = seg_it.data ()->position () - prev_right;
1491  sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1492  sp_var *= sp_var;
1493  spsum += sp_var;
1494  sp_count++;
1495  }
1496  for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1497  segpos = seg_it.data ()->position ();
1498  if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
1499  //big gap
1500  while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
1501  cell = new ICOORDELT (cellpos + (int16_t) initial_pitch, 0);
1502  cell_it.add_after_then_move (cell);
1503  cellpos += (int16_t) initial_pitch;
1504  }
1505  //make new one
1506  cell = new ICOORDELT (segpos, 0);
1507  cell_it.add_after_then_move (cell);
1508  cellpos = segpos;
1509  }
1510  else if (segpos > cellpos - initial_pitch / 2) {
1511  cell = cell_it.data ();
1512  //average positions
1513  cell->set_x ((cellpos + segpos) / 2);
1514  cellpos = cell->x ();
1515  }
1516  }
1517  seg_it.move_to_last ();
1518  prev_right = seg_it.data ()->position ();
1519  if (textord_pitch_scalebigwords) {
1520  scale_factor = (seg_list.length () - 2) / 2;
1521  if (scale_factor < 1)
1522  scale_factor = 1;
1523  }
1524  else
1525  scale_factor = 1;
1526  sqsum += word_sync * scale_factor;
1527  total_count += (seg_list.length () - 1) * scale_factor;
1528  seg_list.clear ();
1529  }
1530  while (!blob_it.cycled_list ());
1531  sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1532  return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1533 }
float intercept() const
Definition: blobbox.h:601
void set_x(int16_t xin)
rewrite function
Definition: points.h:62
Definition: rect.h:34
double check_pitch_sync(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, FPSEGPT_LIST *seg_list)
Definition: pitsync1.cpp:144
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:637
EXTERN double textord_projection_scale
Definition: topitch.cpp:55
float xheight
Definition: blobbox.h:670
float compute_pitch_sd2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float initial_pitch, int16_t &occupation, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
Definition: topitch.cpp:1543
double check_pitch_sync2(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, int16_t projection_left, int16_t projection_right, float projection_scale, int16_t &occupation_count, FPSEGPT_LIST *seg_list, int16_t start, int16_t end)
Definition: pithsync.cpp:294
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
EXTERN bool textord_pitch_scalebigwords
Definition: tovars.cpp:69
int16_t x() const
access function
Definition: points.h:53
EXTERN bool textord_show_fixed_cuts
Definition: drawtord.cpp:35
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
int16_t right() const
Definition: rect.h:79
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584
void plot_fp_cells2(ScrollView *win, ScrollView::Color colour, TO_ROW *row, FPSEGPT_LIST *seg_list)
Definition: drawtord.cpp:363
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612

◆ compute_pitch_sd2()

float compute_pitch_sd2 ( TO_ROW *  row,
STATS *  projection,
int16_t  projection_left,
int16_t  projection_right,
float  initial_pitch,
int16_t &  occupation,
int16_t &  mid_cuts,
ICOORDELT_LIST *  row_cells,
bool  testing_on,
int16_t  start,
int16_t  end 
)

Definition at line 1543 of file topitch.cpp.

1555  {
1556  //blobs
1557  BLOBNBOX_IT blob_it = row->blob_list ();
1558  BLOBNBOX_IT plot_it;
1559  int16_t blob_count; //no of blobs
1560  TBOX blob_box; //bounding box
1561  FPSEGPT_LIST seg_list; //char cells
1562  FPSEGPT_IT seg_it; //iterator
1563  int16_t segpos; //position of segment
1564  //iterator
1565  ICOORDELT_IT cell_it = row_cells;
1566  ICOORDELT *cell; //new cell
1567  double word_sync; //result for word
1568 
1569  mid_cuts = 0;
1570  if (blob_it.empty ()) {
1571  occupation = 0;
1572  return initial_pitch * 10;
1573  }
1574 #ifndef GRAPHICS_DISABLED
1575  if (testing_on && to_win != nullptr) {
1576  projection->plot (to_win, projection_left,
1577  row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
1578  }
1579 #endif
1580  blob_count = 0;
1581  blob_it.mark_cycle_pt ();
1582  do {
1583  //first blob
1584  blob_box = box_next (&blob_it);
1585  blob_count++;
1586  }
1587  while (!blob_it.cycled_list ());
1588  plot_it = blob_it;
1589  word_sync = check_pitch_sync2 (&blob_it, blob_count, (int16_t) initial_pitch,
1590  2, projection, projection_left,
1591  projection_right,
1592  row->xheight * textord_projection_scale,
1593  occupation, &seg_list, start, end);
1594  if (testing_on) {
1595  tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
1596  blob_box.right (), blob_box.top (),
1597  seg_list.length () - 1, word_sync);
1598  seg_it.set_to_list (&seg_list);
1599  for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1600  if (seg_it.data ()->faked)
1601  tprintf ("(F)");
1602  tprintf ("%d, ", seg_it.data ()->position ());
1603  // tprintf("C=%g, s=%g, sq=%g\n",
1604  // seg_it.data()->cost_function(),
1605  // seg_it.data()->sum(),
1606  // seg_it.data()->squares());
1607  }
1608  tprintf ("\n");
1609  }
1610 #ifndef GRAPHICS_DISABLED
1611  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr)
1612  plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1613 #endif
1614  seg_it.set_to_list (&seg_list);
1615  for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
1616  segpos = seg_it.data ()->position ();
1617  //make new one
1618  cell = new ICOORDELT (segpos, 0);
1619  cell_it.add_after_then_move (cell);
1620  if (seg_it.at_last ())
1621  mid_cuts = seg_it.data ()->cheap_cuts ();
1622  }
1623  seg_list.clear ();
1624  return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
1625 }
float intercept() const
Definition: blobbox.h:601
Definition: rect.h:34
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:637
EXTERN double textord_projection_scale
Definition: topitch.cpp:55
float xheight
Definition: blobbox.h:670
double check_pitch_sync2(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, int16_t projection_left, int16_t projection_right, float projection_scale, int16_t &occupation_count, FPSEGPT_LIST *seg_list, int16_t start, int16_t end)
Definition: pithsync.cpp:294
int16_t top() const
Definition: rect.h:58
EXTERN bool textord_show_fixed_cuts
Definition: drawtord.cpp:35
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
int16_t right() const
Definition: rect.h:79
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584
void plot_fp_cells2(ScrollView *win, ScrollView::Color colour, TO_ROW *row, FPSEGPT_LIST *seg_list)
Definition: drawtord.cpp:363
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612

◆ compute_rows_pitch()

bool compute_rows_pitch ( TO_BLOCK *  block,
int32_t  block_index,
bool  testing_on 
)

Definition at line 361 of file topitch.cpp.

365  {
366  int32_t maxwidth; //of spaces
367  TO_ROW *row; //current row
368  int32_t row_index; //row number.
369  float lower, upper; //cluster thresholds
370  TO_ROW_IT row_it = block->get_rows ();
371 
372  row_index = 1;
373  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
374  row = row_it.data ();
375  ASSERT_HOST (row->xheight > 0);
376  row->compute_vertical_projection ();
377  maxwidth = (int32_t) ceil (row->xheight * textord_words_maxspace);
378  if (row_pitch_stats (row, maxwidth, testing_on)
379  && find_row_pitch (row, maxwidth,
380  textord_dotmatrix_gap + 1, block, block_index,
381  row_index, testing_on)) {
382  if (row->fixed_pitch == 0) {
383  lower = row->pr_nonsp;
384  upper = row->pr_space;
385  row->space_size = upper;
386  row->kern_size = lower;
387  }
388  }
389  else {
390  row->fixed_pitch = 0.0f; //insufficient data
391  row->pitch_decision = PITCH_DUNNO;
392  }
393  row_index++;
394  }
395  return false;
396 }
bool row_pitch_stats(TO_ROW *row, int32_t maxwidth, bool testing_on)
Definition: topitch.cpp:711
void compute_vertical_projection()
Definition: blobbox.cpp:797
float fixed_pitch
Definition: blobbox.h:664
float space_size
Definition: blobbox.h:680
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
float pr_nonsp
Definition: blobbox.h:668
bool find_row_pitch(TO_ROW *row, int32_t maxwidth, int32_t dm_gap, TO_BLOCK *block, int32_t block_index, int32_t row_index, bool testing_on)
Definition: topitch.cpp:844
float xheight
Definition: blobbox.h:670
float pr_space
Definition: blobbox.h:667
EXTERN double textord_words_maxspace
Definition: tovars.cpp:44
float kern_size
Definition: blobbox.h:679
EXTERN int textord_dotmatrix_gap
Definition: tovars.cpp:35
#define ASSERT_HOST(x)
Definition: errcode.h:84
PITCH_TYPE pitch_decision
Definition: blobbox.h:663

◆ count_block_votes()

void count_block_votes ( TO_BLOCK *  block,
int32_t &  def_fixed,
int32_t &  def_prop,
int32_t &  maybe_fixed,
int32_t &  maybe_prop,
int32_t &  corr_fixed,
int32_t &  corr_prop,
int32_t &  dunno 
)

Definition at line 664 of file topitch.cpp.

672  {
673  TO_ROW *row; //current row
674  TO_ROW_IT row_it = block->get_rows ();
675 
676  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
677  row = row_it.data ();
678  switch (row->pitch_decision) {
679  case PITCH_DUNNO:
680  dunno++;
681  break;
682  case PITCH_DEF_PROP:
683  def_prop++;
684  break;
685  case PITCH_MAYBE_PROP:
686  maybe_prop++;
687  break;
688  case PITCH_DEF_FIXED:
689  def_fixed++;
690  break;
691  case PITCH_MAYBE_FIXED:
692  maybe_fixed++;
693  break;
694  case PITCH_CORR_PROP:
695  corr_prop++;
696  break;
697  case PITCH_CORR_FIXED:
698  corr_fixed++;
699  break;
700  }
701  }
702 }
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
PITCH_TYPE pitch_decision
Definition: blobbox.h:663

◆ count_pitch_stats()

bool count_pitch_stats ( TO_ROW *  row,
STATS *  gap_stats,
STATS *  pitch_stats,
float  initial_pitch,
float  min_space,
bool  ignore_outsize,
bool  split_outsize,
int32_t  dm_gap 
)

Definition at line 1068 of file topitch.cpp.

1077  {
1078  bool prev_valid; //not word broken
1079  BLOBNBOX *blob; //current blob
1080  //blobs
1081  BLOBNBOX_IT blob_it = row->blob_list ();
1082  int32_t prev_right; //end of prev blob
1083  int32_t prev_centre; //centre of previous blob
1084  int32_t x_centre; //centre of this blob
1085  int32_t blob_width; //width of blob
1086  int32_t width_units; //no of widths in blob
1087  float width; //blob width
1088  TBOX blob_box; //bounding box
1089  TBOX joined_box; //of super blob
1090 
1091  gap_stats->clear ();
1092  pitch_stats->clear ();
1093  if (blob_it.empty ())
1094  return false;
1095  prev_valid = false;
1096  prev_centre = 0;
1097  prev_right = 0; // stop compiler warning
1098  joined_box = blob_it.data ()->bounding_box ();
1099  do {
1100  blob_it.forward ();
1101  blob = blob_it.data ();
1102  if (!blob->joined_to_prev ()) {
1103  blob_box = blob->bounding_box ();
1104  if ((blob_box.left () - joined_box.right () < dm_gap
1105  && !blob_it.at_first ())
1106  || blob->cblob() == nullptr)
1107  joined_box += blob_box; //merge blobs
1108  else {
1109  blob_width = joined_box.width ();
1110  if (split_outsize) {
1111  width_units =
1112  (int32_t) floor ((float) blob_width / initial_pitch + 0.5);
1113  if (width_units < 1)
1114  width_units = 1;
1115  width_units--;
1116  }
1117  else if (ignore_outsize) {
1118  width = (float) blob_width / initial_pitch;
1119  width_units = width < 1 + words_default_fixed_limit
1120  && width > 1 - words_default_fixed_limit ? 0 : -1;
1121  }
1122  else
1123  width_units = 0; //everything in
1124  x_centre = (int32_t) (joined_box.left ()
1125  + (blob_width -
1126  width_units * initial_pitch) / 2);
1127  if (prev_valid && width_units >= 0) {
1128  // if (width_units>0)
1129  // {
1130  // tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
1131  // width_units,blob_width,x_centre,x_centre-prev_centre);
1132  // }
1133  gap_stats->add (joined_box.left () - prev_right, 1);
1134  pitch_stats->add (x_centre - prev_centre, 1);
1135  }
1136  prev_centre = (int32_t) (x_centre + width_units * initial_pitch);
1137  prev_right = joined_box.right ();
1138  prev_valid = blob_box.left () - joined_box.right () < min_space;
1139  prev_valid = prev_valid && width_units >= 0;
1140  joined_box = blob_box;
1141  }
1142  }
1143  }
1144  while (!blob_it.at_first ());
1145  return gap_stats->get_total () >= 3;
1146 }
EXTERN double words_default_fixed_limit
Definition: tovars.cpp:74
void clear()
Definition: statistc.cpp:82
Definition: rect.h:34
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
bool joined_to_prev() const
Definition: blobbox.h:257
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
const TBOX & bounding_box() const
Definition: blobbox.h:231
int16_t right() const
Definition: rect.h:79
C_BLOB * cblob() const
Definition: blobbox.h:269
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612
int32_t get_total() const
Definition: statistc.h:86

◆ find_repeated_chars()

void find_repeated_chars ( TO_BLOCK *  block,
bool  testing_on 
)

Definition at line 1763 of file topitch.cpp.

1764  { // Debug mode.
1765  POLY_BLOCK* pb = block->block->pdblk.poly_block();
1766  if (pb != nullptr && !pb->IsText())
1767  return; // Don't find repeated chars in non-text blocks.
1768 
1769  TO_ROW *row;
1770  BLOBNBOX_IT box_it;
1771  BLOBNBOX_IT search_it; // forward search
1772  WERD *word; // new word
1773  TBOX word_box; // for plotting
1774  int blobcount, repeated_set;
1775 
1776  TO_ROW_IT row_it = block->get_rows();
1777  if (row_it.empty()) return; // empty block
1778  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1779  row = row_it.data();
1780  box_it.set_to_list(row->blob_list());
1781  if (box_it.empty()) continue; // no blobs in this row
1782  if (!row->rep_chars_marked()) {
1783  mark_repeated_chars(row);
1784  }
1785  if (row->num_repeated_sets() == 0) continue; // nothing to do for this row
1786  // new words
1787  WERD_IT word_it(&row->rep_words);
1788  do {
1789  if (box_it.data()->repeated_set() != 0 &&
1790  !box_it.data()->joined_to_prev()) {
1791  blobcount = 1;
1792  repeated_set = box_it.data()->repeated_set();
1793  search_it = box_it;
1794  search_it.forward();
1795  while (!search_it.at_first() &&
1796  search_it.data()->repeated_set() == repeated_set) {
1797  blobcount++;
1798  search_it.forward();
1799  }
1800  // After the call to make_real_word() all the blobs from this
1801  // repeated set will be removed from the blob list. box_it will be
1802  // set to point to the blob after the end of the extracted sequence.
1803  word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
1804  if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1805  tprintf("Bad box joined to prev at");
1806  box_it.data()->bounding_box().print();
1807  tprintf("After repeated word:");
1808  word->bounding_box().print();
1809  }
1810  ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1811  word->set_flag(W_REP_CHAR, true);
1812  word->set_flag(W_DONT_CHOP, true);
1813  word_it.add_after_then_move(word);
1814  } else {
1815  box_it.forward();
1816  }
1817  } while (!box_it.at_first());
1818  }
1819 }
WERD * make_real_word(BLOBNBOX_IT *box_it, int32_t blobcount, bool bol, uint8_t blanks)
Definition: wordseg.cpp:583
void print() const
Definition: rect.h:278
int num_repeated_sets() const
Definition: blobbox.h:650
TBOX bounding_box() const
Definition: werd.cpp:159
WERD_LIST rep_words
Definition: blobbox.h:681
Definition: rect.h:34
void mark_repeated_chars(TO_ROW *row)
Definition: makerow.cpp:2641
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
bool rep_chars_marked() const
Definition: blobbox.h:644
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
Definition: werd.h:59
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool IsText() const
Definition: polyblk.h:49
BLOCK * block
Definition: blobbox.h:790
PDBLK pdblk
Definition: ocrblock.h:192
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ find_row_pitch()

bool find_row_pitch ( TO_ROW *  row,
int32_t  maxwidth,
int32_t  dm_gap,
TO_BLOCK *  block,
int32_t  block_index,
int32_t  row_index,
bool  testing_on 
)

Definition at line 844 of file topitch.cpp.

852  {
853  bool used_dm_model; //looks like dot matrix
854  float min_space; //estimate threshold
855  float non_space; //gap size
856  float gap_iqr; //interquartile range
857  float pitch_iqr;
858  float dm_gap_iqr; //interquartile range
859  float dm_pitch_iqr;
860  float dm_pitch; //pitch with dm on
861  float pitch; //revised estimate
862  float initial_pitch; //guess at pitch
863  STATS gap_stats (0, maxwidth);
864  //centre-centre
865  STATS pitch_stats (0, maxwidth);
866 
867  row->fixed_pitch = 0.0f;
868  initial_pitch = row->fp_space;
869  if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
870  initial_pitch = row->xheight;//keep pitch decent
871  non_space = row->fp_nonsp;
872  if (non_space > initial_pitch)
873  non_space = initial_pitch;
874  min_space = (initial_pitch + non_space) / 2;
875 
876  if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
877  initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
878  dm_gap_iqr = 0.0001;
879  dm_pitch_iqr = maxwidth * 2.0f;
880  dm_pitch = initial_pitch;
881  }
882  else {
883  dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
884  dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
885  dm_pitch = pitch_stats.ile (0.5);
886  }
887  gap_stats.clear ();
888  pitch_stats.clear ();
889  if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
890  initial_pitch, min_space, TRUE, FALSE, 0)) {
891  gap_iqr = 0.0001;
892  pitch_iqr = maxwidth * 3.0f;
893  }
894  else {
895  gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
896  pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
897  if (testing_on)
898  tprintf
899  ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
900  initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
901  initial_pitch = pitch_stats.ile (0.5);
902  if (min_space > initial_pitch
903  && count_pitch_stats (row, &gap_stats, &pitch_stats,
904  initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
905  min_space = initial_pitch;
906  gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
907  pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
908  if (testing_on)
909  tprintf
910  ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
911  initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
912  initial_pitch = pitch_stats.ile (0.5);
913  }
914  }
915  if (textord_debug_pitch_metric)
916  tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
917  block_index, row_index, 'X',
918  pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
919  pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' :
920  (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
921  if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
922  row->pitch_decision = PITCH_DUNNO;
923  if (textord_debug_pitch_metric)
924  tprintf ("\n");
925  return false; //insufficient data
926  }
927  if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
928  if (testing_on)
929  tprintf
930  ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
931  pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
932  gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
933  pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
934  pitch = pitch_stats.ile (0.5);
935  used_dm_model = false;
936  }
937  else {
938  if (testing_on)
939  tprintf
940  ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
941  pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
942  gap_iqr = dm_gap_iqr;
943  pitch_iqr = dm_pitch_iqr;
944  pitch = dm_pitch;
945  used_dm_model = true;
946  }
947  if (textord_debug_pitch_metric) {
948  tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
949  pitch_iqr, gap_iqr, pitch);
950  tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
951  pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
952  pitch_iqr < gap_iqr * textord_fpiqr_ratio
953  && pitch_iqr < block->xheight * textord_max_pitch_iqr
954  && pitch < block->xheight * textord_words_default_maxspace
955  ? 'F' : 'P');
956  }
957  if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
958  && pitch_iqr < block->xheight * textord_max_pitch_iqr
959  && pitch < block->xheight * textord_words_default_maxspace)
960  row->pitch_decision = PITCH_MAYBE_FIXED;
961  else
962  row->pitch_decision = PITCH_MAYBE_PROP;
963  row->fixed_pitch = pitch;
964  row->kern_size = gap_stats.ile (0.5);
965  row->min_space = (int32_t) (row->fixed_pitch + non_space) / 2;
966  if (row->min_space > row->fixed_pitch)
967  row->min_space = (int32_t) row->fixed_pitch;
968  row->max_nonspace = row->min_space;
969  row->space_size = row->fixed_pitch;
970  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
971  row->used_dm_model = used_dm_model;
972  return true;
973 }
bool used_dm_model
Definition: blobbox.h:660
EXTERN double words_default_fixed_limit
Definition: tovars.cpp:74
#define TRUE
Definition: capi.h:51
float fixed_pitch
Definition: blobbox.h:664
EXTERN double textord_words_default_maxspace
Definition: tovars.cpp:46
EXTERN double textord_max_pitch_iqr
Definition: tovars.cpp:82
float space_size
Definition: blobbox.h:680
Definition: statistc.h:33
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
float xheight
Definition: blobbox.h:670
#define FALSE
Definition: capi.h:52
float fp_space
Definition: blobbox.h:665
float kern_size
Definition: blobbox.h:679
float xheight
Definition: blobbox.h:801
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int32_t space_threshold
Definition: blobbox.h:678
bool count_pitch_stats(TO_ROW *row, STATS *gap_stats, STATS *pitch_stats, float initial_pitch, float min_space, bool ignore_outsize, bool split_outsize, int32_t dm_gap)
Definition: topitch.cpp:1068
EXTERN double textord_fpiqr_ratio
Definition: tovars.cpp:81
int32_t min_space
Definition: blobbox.h:676
float fp_nonsp
Definition: blobbox.h:666
int32_t max_nonspace
Definition: blobbox.h:677
PITCH_TYPE pitch_decision
Definition: blobbox.h:663

◆ fix_row_pitch()

void fix_row_pitch ( TO_ROW *  bad_row,
TO_BLOCK *  bad_block,
TO_BLOCK_LIST *  blocks,
int32_t  row_target,
int32_t  block_target 
)

Definition at line 153 of file topitch.cpp.

157  { // number of block
158  int16_t mid_cuts;
159  int block_votes; //votes in block
160  int like_votes; //votes over page
161  int other_votes; //votes of unlike blocks
162  int block_index; //number of block
163  int row_index; //number of row
164  int maxwidth; //max pitch
165  TO_BLOCK_IT block_it = blocks; //block iterator
166  TO_BLOCK *block; //current block
167  TO_ROW *row; //current row
168  float sp_sd; //space deviation
169  STATS block_stats; //pitches in block
170  STATS like_stats; //pitches in page
171 
172  block_votes = like_votes = other_votes = 0;
173  maxwidth = (int32_t) ceil (bad_row->xheight * textord_words_maxspace);
174  if (bad_row->pitch_decision != PITCH_DEF_FIXED
175  && bad_row->pitch_decision != PITCH_DEF_PROP) {
176  block_stats.set_range (0, maxwidth);
177  like_stats.set_range (0, maxwidth);
178  block_index = 1;
179  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
180  block_it.forward()) {
181  block = block_it.data();
182  POLY_BLOCK* pb = block->block->pdblk.poly_block();
183  if (pb != nullptr && !pb->IsText()) continue; // Non text doesn't exist!
184  row_index = 1;
185  TO_ROW_IT row_it(block->get_rows());
186  for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
187  row_it.forward ()) {
188  row = row_it.data ();
189  if ((bad_row->all_caps
190  && row->xheight + row->ascrise
191  <
192  (bad_row->xheight + bad_row->ascrise) * (1 +
194  && row->xheight + row->ascrise >
195  (bad_row->xheight + bad_row->ascrise) * (1 -
197  || (!bad_row->all_caps
198  && row->xheight <
199  bad_row->xheight * (1 + textord_pitch_rowsimilarity)
200  && row->xheight >
201  bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
202  if (block_index == block_target) {
203  if (row->pitch_decision == PITCH_DEF_FIXED) {
204  block_votes += textord_words_veto_power;
205  block_stats.add ((int32_t) row->fixed_pitch,
207  }
208  else if (row->pitch_decision == PITCH_MAYBE_FIXED
209  || row->pitch_decision == PITCH_CORR_FIXED) {
210  block_votes++;
211  block_stats.add ((int32_t) row->fixed_pitch, 1);
212  }
213  else if (row->pitch_decision == PITCH_DEF_PROP)
214  block_votes -= textord_words_veto_power;
215  else if (row->pitch_decision == PITCH_MAYBE_PROP
216  || row->pitch_decision == PITCH_CORR_PROP)
217  block_votes--;
218  }
219  else {
220  if (row->pitch_decision == PITCH_DEF_FIXED) {
221  like_votes += textord_words_veto_power;
222  like_stats.add ((int32_t) row->fixed_pitch,
224  }
225  else if (row->pitch_decision == PITCH_MAYBE_FIXED
226  || row->pitch_decision == PITCH_CORR_FIXED) {
227  like_votes++;
228  like_stats.add ((int32_t) row->fixed_pitch, 1);
229  }
230  else if (row->pitch_decision == PITCH_DEF_PROP)
231  like_votes -= textord_words_veto_power;
232  else if (row->pitch_decision == PITCH_MAYBE_PROP
233  || row->pitch_decision == PITCH_CORR_PROP)
234  like_votes--;
235  }
236  }
237  else {
238  if (row->pitch_decision == PITCH_DEF_FIXED)
239  other_votes += textord_words_veto_power;
240  else if (row->pitch_decision == PITCH_MAYBE_FIXED
241  || row->pitch_decision == PITCH_CORR_FIXED)
242  other_votes++;
243  else if (row->pitch_decision == PITCH_DEF_PROP)
244  other_votes -= textord_words_veto_power;
245  else if (row->pitch_decision == PITCH_MAYBE_PROP
246  || row->pitch_decision == PITCH_CORR_PROP)
247  other_votes--;
248  }
249  row_index++;
250  }
251  block_index++;
252  }
253  if (block_votes > textord_words_veto_power) {
254  bad_row->fixed_pitch = block_stats.ile (0.5);
255  bad_row->pitch_decision = PITCH_CORR_FIXED;
256  }
257  else if (block_votes <= textord_words_veto_power && like_votes > 0) {
258  bad_row->fixed_pitch = like_stats.ile (0.5);
259  bad_row->pitch_decision = PITCH_CORR_FIXED;
260  }
261  else {
262  bad_row->pitch_decision = PITCH_CORR_PROP;
263  if (block_votes == 0 && like_votes == 0 && other_votes > 0
265  tprintf
266  ("Warning:row %d of block %d set prop with no like rows against trend\n",
267  row_target, block_target);
268  }
269  }
271  tprintf(":b_votes=%d:l_votes=%d:o_votes=%d",
272  block_votes, like_votes, other_votes);
273  tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
274  }
275  if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
276  if (bad_row->fixed_pitch < textord_min_xheight) {
277  if (block_votes > 0)
278  bad_row->fixed_pitch = block_stats.ile (0.5);
279  else if (block_votes == 0 && like_votes > 0)
280  bad_row->fixed_pitch = like_stats.ile (0.5);
281  else {
282  tprintf
283  ("Warning:guessing pitch as xheight on row %d, block %d\n",
284  row_target, block_target);
285  bad_row->fixed_pitch = bad_row->xheight;
286  }
287  }
288  if (bad_row->fixed_pitch < textord_min_xheight)
289  bad_row->fixed_pitch = (float) textord_min_xheight;
290  bad_row->kern_size = bad_row->fixed_pitch / 4;
291  bad_row->min_space = (int32_t) (bad_row->fixed_pitch * 0.6);
292  bad_row->max_nonspace = (int32_t) (bad_row->fixed_pitch * 0.4);
293  bad_row->space_threshold =
294  (bad_row->min_space + bad_row->max_nonspace) / 2;
295  bad_row->space_size = bad_row->fixed_pitch;
296  if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
297  tune_row_pitch (bad_row, &bad_row->projection,
298  bad_row->projection_left, bad_row->projection_right,
299  (bad_row->fixed_pitch +
300  bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
301  sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
302  }
303  }
304  else if (bad_row->pitch_decision == PITCH_CORR_PROP
305  || bad_row->pitch_decision == PITCH_DEF_PROP) {
306  bad_row->fixed_pitch = 0.0f;
307  bad_row->char_cells.clear ();
308  }
309 }
EXTERN double textord_pitch_rowsimilarity
Definition: tovars.cpp:67
ICOORDELT_LIST char_cells
Definition: blobbox.h:682
float fixed_pitch
Definition: blobbox.h:664
int textord_min_xheight
Definition: makerow.cpp:68
int16_t projection_right
Definition: blobbox.h:662
float space_size
Definition: blobbox.h:680
Definition: statistc.h:33
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
float xheight
Definition: blobbox.h:670
EXTERN int textord_words_veto_power
Definition: tovars.cpp:65
#define FALSE
Definition: capi.h:52
EXTERN double textord_words_maxspace
Definition: tovars.cpp:44
STATS projection
Definition: blobbox.h:684
double ile(double frac) const
Definition: statistc.cpp:173
float kern_size
Definition: blobbox.h:679
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:63
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int32_t space_threshold
Definition: blobbox.h:678
float ascrise
Definition: blobbox.h:672
bool IsText() const
Definition: polyblk.h:49
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
float tune_row_pitch(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1156
BLOCK * block
Definition: blobbox.h:790
int16_t projection_left
Definition: blobbox.h:661
int32_t min_space
Definition: blobbox.h:676
bool all_caps
Definition: blobbox.h:659
int32_t max_nonspace
Definition: blobbox.h:677
PDBLK pdblk
Definition: ocrblock.h:192
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612
EXTERN bool textord_debug_pitch_test
Definition: topitch.cpp:42
PITCH_TYPE pitch_decision
Definition: blobbox.h:663

◆ fixed_pitch_row()

bool fixed_pitch_row ( TO_ROW *  row,
BLOCK *  block,
int32_t  block_index 
)

Definition at line 984 of file topitch.cpp.

987  {
988  const char *res_string; // pitch result
989  int16_t mid_cuts; // no of cheap cuts
990  float non_space; // gap size
991  float pitch_sd; // error on pitch
992  float sp_sd = 0.0f; // space sd
993 
994  non_space = row->fp_nonsp;
995  if (non_space > row->fixed_pitch)
996  non_space = row->fixed_pitch;
997  POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
998  if (textord_all_prop || (pb != nullptr && !pb->IsText())) {
999  // Set the decision to definitely proportional.
1000  pitch_sd = textord_words_def_prop * row->fixed_pitch;
1002  } else {
1003  pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
1004  row->projection_right,
1005  (row->fixed_pitch + non_space * 3) / 4,
1006  row->fixed_pitch, sp_sd, mid_cuts,
1007  &row->char_cells,
1008  block_index == textord_debug_block);
1009  if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
1010  && ((pitsync_linear_version & 3) < 3
1011  || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model
1012  || sp_sd > 20
1013  || (pitch_sd == 0 && sp_sd > 10))))) {
1014  if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
1015  && !row->all_caps
1016  && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
1018  else
1020  }
1021  else if ((pitsync_linear_version & 3) < 3
1022  || sp_sd > 20
1023  || mid_cuts > 0
1024  || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
1025  if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
1027  else
1029  }
1030  else
1031  row->pitch_decision = PITCH_DUNNO;
1032  }
1033 
1035  res_string = "??";
1036  switch (row->pitch_decision) {
1037  case PITCH_DEF_PROP:
1038  res_string = "DP";
1039  break;
1040  case PITCH_MAYBE_PROP:
1041  res_string = "MP";
1042  break;
1043  case PITCH_DEF_FIXED:
1044  res_string = "DF";
1045  break;
1046  case PITCH_MAYBE_FIXED:
1047  res_string = "MF";
1048  break;
1049  default:
1050  res_string = "??";
1051  }
1052  tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
1053  pitch_sd / row->fixed_pitch, sp_sd, res_string);
1054  }
1055  return true;
1056 }
bool used_dm_model
Definition: blobbox.h:660
ICOORDELT_LIST char_cells
Definition: blobbox.h:682
float fixed_pitch
Definition: blobbox.h:664
int16_t projection_right
Definition: blobbox.h:662
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
STATS projection
Definition: blobbox.h:684
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool IsText() const
Definition: polyblk.h:49
EXTERN double textord_words_pitchsd_threshold
Definition: tovars.cpp:59
EXTERN bool textord_all_prop
Definition: topitch.cpp:40
float tune_row_pitch(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1156
int16_t projection_left
Definition: blobbox.h:661
float fp_nonsp
Definition: blobbox.h:666
EXTERN int textord_debug_block
Definition: tovars.cpp:36
bool all_caps
Definition: blobbox.h:659
EXTERN double textord_words_def_prop
Definition: tovars.cpp:63
PDBLK pdblk
Definition: ocrblock.h:192
PITCH_TYPE pitch_decision
Definition: blobbox.h:663

◆ plot_fp_word()

void plot_fp_word ( TO_BLOCK *  block,
float  pitch,
float  nonspace 
)

Definition at line 1829 of file topitch.cpp.

1833  {
1834  TO_ROW *row; //current row
1835  TO_ROW_IT row_it = block->get_rows ();
1836 
1837  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1838  row = row_it.data ();
1839  row->min_space = (int32_t) ((pitch + nonspace) / 2);
1840  row->max_nonspace = row->min_space;
1841  row->space_threshold = row->min_space;
1842  plot_word_decisions (to_win, (int16_t) pitch, row);
1843  }
1844 }
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:249
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
int32_t space_threshold
Definition: blobbox.h:678
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
int32_t min_space
Definition: blobbox.h:676
int32_t max_nonspace
Definition: blobbox.h:677

◆ print_block_counts()

void print_block_counts ( TO_BLOCK *  block,
int32_t  block_index 
)

Definition at line 627 of file topitch.cpp.

630  {
631  int32_t def_fixed = 0; //counters
632  int32_t def_prop = 0;
633  int32_t maybe_fixed = 0;
634  int32_t maybe_prop = 0;
635  int32_t dunno = 0;
636  int32_t corr_fixed = 0;
637  int32_t corr_prop = 0;
638 
639  count_block_votes(block,
640  def_fixed,
641  def_prop,
642  maybe_fixed,
643  maybe_prop,
644  corr_fixed,
645  corr_prop,
646  dunno);
647  tprintf ("Block %d has (%d,%d,%d)",
648  block_index, def_fixed, maybe_fixed, corr_fixed);
649  if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed))
650  tprintf (" (Wrongly)");
651  tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
652  if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop))
653  tprintf (" (Wrongly)");
654  tprintf (" prop, %d dunno\n", dunno);
655 }
EXTERN bool textord_blocksall_fixed
Definition: tovars.cpp:29
EXTERN bool textord_blocksall_prop
Definition: tovars.cpp:31
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void count_block_votes(TO_BLOCK *block, int32_t &def_fixed, int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, int32_t &corr_prop, int32_t &dunno)
Definition: topitch.cpp:664

◆ print_pitch_sd()

void print_pitch_sd ( TO_ROW *  row,
STATS *  projection,
int16_t  projection_left,
int16_t  projection_right,
float  space_size,
float  initial_pitch 
)

Definition at line 1635 of file topitch.cpp.

1642  {
1643  const char *res2; //pitch result
1644  int16_t occupation; //used cells
1645  float sp_sd; //space sd
1646  //blobs
1647  BLOBNBOX_IT blob_it = row->blob_list ();
1648  BLOBNBOX_IT start_it; //start of word
1649  BLOBNBOX_IT row_start; //start of row
1650  int16_t blob_count; //no of blobs
1651  int16_t total_blob_count; //total blobs in line
1652  TBOX blob_box; //bounding box
1653  TBOX prev_box; //of super blob
1654  int32_t prev_right; //of word sync
1655  int scale_factor; //on scores for big words
1656  int32_t sp_count; //spaces
1657  FPSEGPT_LIST seg_list; //char cells
1658  FPSEGPT_IT seg_it; //iterator
1659  double sqsum; //sum of squares
1660  double spsum; //of spaces
1661  double sp_var; //space error
1662  double word_sync; //result for word
1663  double total_count; //total cuts
1664 
1665  if (blob_it.empty ())
1666  return;
1667  row_start = blob_it;
1668  total_blob_count = 0;
1669 
1670  total_count = 0;
1671  sqsum = 0;
1672  sp_count = 0;
1673  spsum = 0;
1674  prev_right = -1;
1675  blob_it = row_start;
1676  start_it = blob_it;
1677  blob_count = 0;
1678  blob_box = box_next (&blob_it);//first blob
1679  blob_it.mark_cycle_pt ();
1680  do {
1681  for (; blob_count > 0; blob_count--)
1682  box_next(&start_it);
1683  do {
1684  prev_box = blob_box;
1685  blob_count++;
1686  blob_box = box_next (&blob_it);
1687  }
1688  while (!blob_it.cycled_list ()
1689  && blob_box.left () - prev_box.right () < space_size);
1690  word_sync =
1691  check_pitch_sync2 (&start_it, blob_count, (int16_t) initial_pitch, 2,
1692  projection, projection_left, projection_right,
1694  occupation, &seg_list, 0, 0);
1695  total_blob_count += blob_count;
1696  seg_it.set_to_list (&seg_list);
1697  if (prev_right >= 0) {
1698  sp_var = seg_it.data ()->position () - prev_right;
1699  sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
1700  sp_var *= sp_var;
1701  spsum += sp_var;
1702  sp_count++;
1703  }
1704  seg_it.move_to_last ();
1705  prev_right = seg_it.data ()->position ();
1707  scale_factor = (seg_list.length () - 2) / 2;
1708  if (scale_factor < 1)
1709  scale_factor = 1;
1710  }
1711  else
1712  scale_factor = 1;
1713  sqsum += word_sync * scale_factor;
1714  total_count += (seg_list.length () - 1) * scale_factor;
1715  seg_list.clear ();
1716  }
1717  while (!blob_it.cycled_list ());
1718  sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
1719  word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
1720  tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
1721  word_sync, word_sync / initial_pitch, sp_sd,
1722  word_sync < textord_words_pitchsd_threshold * initial_pitch
1723  ? 'F' : 'P');
1724 
1725  start_it = row_start;
1726  blob_it = row_start;
1727  word_sync =
1728  check_pitch_sync2 (&blob_it, total_blob_count, (int16_t) initial_pitch, 2,
1729  projection, projection_left, projection_right,
1730  row->xheight * textord_projection_scale, occupation,
1731  &seg_list, 0, 0);
1732  if (occupation > 1)
1733  word_sync /= occupation;
1734  word_sync = sqrt (word_sync);
1735 
1736 #ifndef GRAPHICS_DISABLED
1737  if (textord_show_row_cuts && to_win != nullptr)
1738  plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1739 #endif
1740  seg_list.clear ();
1741  if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1742  if (word_sync < textord_words_def_fixed * initial_pitch
1743  && !row->all_caps)
1744  res2 = "DF";
1745  else
1746  res2 = "MF";
1747  }
1748  else
1749  res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1750  tprintf
1751  ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
1752  word_sync, word_sync / initial_pitch,
1753  word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
1754  occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
1755 }
float fixed_pitch
Definition: blobbox.h:664
Definition: rect.h:34
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:637
EXTERN double textord_projection_scale
Definition: topitch.cpp:55
float xheight
Definition: blobbox.h:670
double check_pitch_sync2(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, int16_t projection_left, int16_t projection_right, float projection_scale, int16_t &occupation_count, FPSEGPT_LIST *seg_list, int16_t start, int16_t end)
Definition: pithsync.cpp:294
int16_t left() const
Definition: rect.h:72
EXTERN double textord_words_def_fixed
Definition: tovars.cpp:61
EXTERN bool textord_pitch_scalebigwords
Definition: tovars.cpp:69
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
EXTERN double textord_words_pitchsd_threshold
Definition: tovars.cpp:59
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
EXTERN bool textord_show_row_cuts
Definition: topitch.cpp:49
int16_t right() const
Definition: rect.h:79
bool all_caps
Definition: blobbox.h:659
EXTERN double textord_words_def_prop
Definition: tovars.cpp:63
void plot_fp_cells2(ScrollView *win, ScrollView::Color colour, TO_ROW *row, FPSEGPT_LIST *seg_list)
Definition: drawtord.cpp:363
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612

◆ row_pitch_stats()

bool row_pitch_stats ( TO_ROW *  row,
int32_t  maxwidth,
bool  testing_on 
)

Definition at line 711 of file topitch.cpp.

715  {
716  BLOBNBOX *blob; //current blob
717  int gap_index; //current gap
718  int32_t prev_x; //end of prev blob
719  int32_t cluster_count; //no of clusters
720  int32_t prev_count; //of clusters
721  int32_t smooth_factor; //for smoothing stats
722  TBOX blob_box; //bounding box
723  float lower, upper; //cluster thresholds
724  //gap sizes
725  float gaps[BLOCK_STATS_CLUSTERS];
726  //blobs
727  BLOBNBOX_IT blob_it = row->blob_list ();
728  STATS gap_stats (0, maxwidth);
729  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
730  //clusters
731 
732  smooth_factor =
733  (int32_t) (row->xheight * textord_wordstats_smooth_factor + 1.5);
734  if (!blob_it.empty ()) {
735  prev_x = blob_it.data ()->bounding_box ().right ();
736  blob_it.forward ();
737  while (!blob_it.at_first ()) {
738  blob = blob_it.data ();
739  if (!blob->joined_to_prev ()) {
740  blob_box = blob->bounding_box ();
741  if (blob_box.left () - prev_x < maxwidth)
742  gap_stats.add (blob_box.left () - prev_x, 1);
743  prev_x = blob_box.right ();
744  }
745  blob_it.forward ();
746  }
747  }
748  if (gap_stats.get_total () == 0) {
749  return false;
750  }
751  cluster_count = 0;
752  lower = row->xheight * words_initial_lower;
753  upper = row->xheight * words_initial_upper;
754  gap_stats.smooth (smooth_factor);
755  do {
756  prev_count = cluster_count;
757  cluster_count = gap_stats.cluster (lower, upper,
759  BLOCK_STATS_CLUSTERS, cluster_stats);
760  }
761  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
762  if (cluster_count < 1) {
763  return false;
764  }
765  for (gap_index = 0; gap_index < cluster_count; gap_index++)
766  gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
767  //get medians
768  if (testing_on) {
769  tprintf ("cluster_count=%d:", cluster_count);
770  for (gap_index = 0; gap_index < cluster_count; gap_index++)
771  tprintf (" %g(%d)", gaps[gap_index],
772  cluster_stats[gap_index + 1].get_total ());
773  tprintf ("\n");
774  }
775  qsort (gaps, cluster_count, sizeof (float), sort_floats);
776 
777  //Try to find proportional non-space and space for row.
778  lower = row->xheight * words_default_prop_nonspace;
779  upper = row->xheight * textord_words_min_minspace;
780  for (gap_index = 0; gap_index < cluster_count
781  && gaps[gap_index] < lower; gap_index++);
782  if (gap_index == 0) {
783  if (testing_on)
784  tprintf ("No clusters below nonspace threshold!!\n");
785  if (cluster_count > 1) {
786  row->pr_nonsp = gaps[0];
787  row->pr_space = gaps[1];
788  }
789  else {
790  row->pr_nonsp = lower;
791  row->pr_space = gaps[0];
792  }
793  }
794  else {
795  row->pr_nonsp = gaps[gap_index - 1];
796  while (gap_index < cluster_count && gaps[gap_index] < upper)
797  gap_index++;
798  if (gap_index == cluster_count) {
799  if (testing_on)
800  tprintf ("No clusters above nonspace threshold!!\n");
801  row->pr_space = lower * textord_spacesize_ratioprop;
802  }
803  else
804  row->pr_space = gaps[gap_index];
805  }
806 
807  //Now try to find the fixed pitch space and non-space.
808  upper = row->xheight * words_default_fixed_space;
809  for (gap_index = 0; gap_index < cluster_count
810  && gaps[gap_index] < upper; gap_index++);
811  if (gap_index == 0) {
812  if (testing_on)
813  tprintf ("No clusters below space threshold!!\n");
814  row->fp_nonsp = upper;
815  row->fp_space = gaps[0];
816  }
817  else {
818  row->fp_nonsp = gaps[gap_index - 1];
819  if (gap_index == cluster_count) {
820  if (testing_on)
821  tprintf ("No clusters above space threshold!!\n");
822  row->fp_space = row->xheight;
823  }
824  else
825  row->fp_space = gaps[gap_index];
826  }
827  if (testing_on) {
828  tprintf
829  ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
830  row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
831  }
832  return true; //computed some stats
833 }
EXTERN double textord_words_min_minspace
Definition: tovars.cpp:49
EXTERN double words_default_fixed_space
Definition: tovars.cpp:73
Definition: rect.h:34
Definition: statistc.h:33
float pr_nonsp
Definition: blobbox.h:668
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:80
float xheight
Definition: blobbox.h:670
int16_t left() const
Definition: rect.h:72
float pr_space
Definition: blobbox.h:667
EXTERN double words_initial_upper
Definition: tovars.cpp:71
float fp_space
Definition: blobbox.h:665
bool joined_to_prev() const
Definition: blobbox.h:257
EXTERN double words_default_prop_nonspace
Definition: tovars.cpp:72
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
EXTERN double textord_wordstats_smooth_factor
Definition: tovars.cpp:39
#define BLOCK_STATS_CLUSTERS
Definition: topitch.cpp:60
const TBOX & bounding_box() const
Definition: blobbox.h:231
float fp_nonsp
Definition: blobbox.h:666
int16_t right() const
Definition: rect.h:79
EXTERN double words_initial_lower
Definition: tovars.cpp:70
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612

◆ try_block_fixed()

bool try_block_fixed ( TO_BLOCK *  block,
int32_t  block_index 
)

Definition at line 545 of file topitch.cpp.

548  {
549  return false;
550 }

◆ try_doc_fixed()

bool try_doc_fixed ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient 
)

Definition at line 405 of file topitch.cpp.

409  {
410  int16_t master_x; //uniform shifts
411  int16_t pitch; //median pitch.
412  int x; //profile coord
413  int prop_blocks; //correct counts
414  int fixed_blocks;
415  int total_row_count; //total in page
416  //iterator
417  TO_BLOCK_IT block_it = port_blocks;
418  TO_BLOCK *block; //current block;
419  TO_ROW *row; //current row
420  int16_t projection_left; //edges
421  int16_t projection_right;
422  int16_t row_left; //edges of row
423  int16_t row_right;
424  ICOORDELT_LIST *master_cells; //cells for page
425  float master_y; //uniform shifts
426  float shift_factor; //page skew correction
427  float row_shift; //shift for row
428  float final_pitch; //output pitch
429  float row_y; //baseline
430  STATS projection; //entire page
431  STATS pitches (0, MAX_ALLOWED_PITCH);
432  //for median
433  float sp_sd; //space sd
434  int16_t mid_cuts; //no of cheap cuts
435  float pitch_sd; //sync rating
436 
437  if (block_it.empty ()
438  // || block_it.data()==block_it.data_relative(1)
440  return false;
441  shift_factor = gradient / (gradient * gradient + 1);
442  // row iterator
443  TO_ROW_IT row_it(block_it.data ()->get_rows());
444  master_x = row_it.data ()->projection_left;
445  master_y = row_it.data ()->baseline.y (master_x);
446  projection_left = INT16_MAX;
447  projection_right = -INT16_MAX;
448  prop_blocks = 0;
449  fixed_blocks = 0;
450  total_row_count = 0;
451 
452  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
453  block_it.forward ()) {
454  block = block_it.data ();
455  row_it.set_to_list (block->get_rows ());
456  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
457  row = row_it.data ();
458  total_row_count++;
459  if (row->fixed_pitch > 0)
460  pitches.add ((int32_t) (row->fixed_pitch), 1);
461  //find median
462  row_y = row->baseline.y (master_x);
463  row_left =
464  (int16_t) (row->projection_left -
465  shift_factor * (master_y - row_y));
466  row_right =
467  (int16_t) (row->projection_right -
468  shift_factor * (master_y - row_y));
469  if (row_left < projection_left)
470  projection_left = row_left;
471  if (row_right > projection_right)
472  projection_right = row_right;
473  }
474  }
475  if (pitches.get_total () == 0)
476  return false;
477  projection.set_range (projection_left, projection_right);
478 
479  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
480  block_it.forward ()) {
481  block = block_it.data ();
482  row_it.set_to_list (block->get_rows ());
483  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
484  row = row_it.data ();
485  row_y = row->baseline.y (master_x);
486  row_left =
487  (int16_t) (row->projection_left -
488  shift_factor * (master_y - row_y));
489  for (x = row->projection_left; x < row->projection_right;
490  x++, row_left++) {
491  projection.add (row_left, row->projection.pile_count (x));
492  }
493  }
494  }
495 
496  row_it.set_to_list (block_it.data ()->get_rows ());
497  row = row_it.data ();
498 #ifndef GRAPHICS_DISABLED
499  if (textord_show_page_cuts && to_win != nullptr)
500  projection.plot (to_win, projection_left,
501  row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
502 #endif
503  final_pitch = pitches.ile (0.5);
504  pitch = (int16_t) final_pitch;
505  pitch_sd =
506  tune_row_pitch (row, &projection, projection_left, projection_right,
507  pitch * 0.75, final_pitch, sp_sd, mid_cuts,
508  &row->char_cells, FALSE);
509 
511  tprintf
512  ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
513  prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
514  pitch_sd / total_row_count, pitch_sd / pitch,
515  pitch_sd / total_row_count / pitch);
516 
517 #ifndef GRAPHICS_DISABLED
518  if (textord_show_page_cuts && to_win != nullptr) {
519  master_cells = &row->char_cells;
520  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
521  block_it.forward ()) {
522  block = block_it.data ();
523  row_it.set_to_list (block->get_rows ());
524  for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
525  row_it.forward ()) {
526  row = row_it.data ();
527  row_y = row->baseline.y (master_x);
528  row_shift = shift_factor * (master_y - row_y);
529  plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
530  }
531  }
532  }
533 #endif
534  row->char_cells.clear ();
535  return false;
536 }
QSPLINE baseline
Definition: blobbox.h:683
float intercept() const
Definition: blobbox.h:601
int32_t pile_count(int32_t value) const
Definition: statistc.h:78
ICOORDELT_LIST char_cells
Definition: blobbox.h:682
EXTERN bool textord_show_page_cuts
Definition: topitch.cpp:50
float fixed_pitch
Definition: blobbox.h:664
int16_t projection_right
Definition: blobbox.h:662
Definition: statistc.h:33
double y(double x) const
Definition: quspline.cpp:209
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
void plot_row_cells(ScrollView *win, ScrollView::Color colour, TO_ROW *row, float xshift, ICOORDELT_LIST *cells)
Definition: drawtord.cpp:396
#define MAX_ALLOWED_PITCH
Definition: topitch.cpp:61
#define FALSE
Definition: capi.h:52
STATS projection
Definition: blobbox.h:684
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:63
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
float tune_row_pitch(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1156
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
int16_t projection_left
Definition: blobbox.h:661
EXTERN bool textord_blockndoc_fixed
Definition: topitch.cpp:54
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584

◆ try_rows_fixed()

bool try_rows_fixed ( TO_BLOCK *  block,
int32_t  block_index,
bool  testing_on 
)

Definition at line 559 of file topitch.cpp.

563  {
564  TO_ROW *row; //current row
565  int32_t row_index; //row number.
566  int32_t def_fixed = 0; //counters
567  int32_t def_prop = 0;
568  int32_t maybe_fixed = 0;
569  int32_t maybe_prop = 0;
570  int32_t dunno = 0;
571  int32_t corr_fixed = 0;
572  int32_t corr_prop = 0;
573  float lower, upper; //cluster thresholds
574  TO_ROW_IT row_it = block->get_rows ();
575 
576  row_index = 1;
577  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
578  row = row_it.data ();
579  ASSERT_HOST (row->xheight > 0);
580  if (row->fixed_pitch > 0 &&
581  fixed_pitch_row(row, block->block, block_index)) {
582  if (row->fixed_pitch == 0) {
583  lower = row->pr_nonsp;
584  upper = row->pr_space;
585  row->space_size = upper;
586  row->kern_size = lower;
587  }
588  }
589  row_index++;
590  }
591  count_block_votes(block,
592  def_fixed,
593  def_prop,
594  maybe_fixed,
595  maybe_prop,
596  corr_fixed,
597  corr_prop,
598  dunno);
599  if (testing_on
600  && (textord_debug_pitch_test || textord_blocksall_prop
601  || textord_blocksall_fixed)) {
602  tprintf ("Initially:");
603  print_block_counts(block, block_index);
604  }
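605  if (def_fixed > def_prop * textord_words_veto_power)
606  block->pitch_decision = PITCH_DEF_FIXED;
607  else if (def_prop > def_fixed * textord_words_veto_power)
608  block->pitch_decision = PITCH_DEF_PROP;
609  else if (def_fixed > 0 || def_prop > 0)
610  block->pitch_decision = PITCH_DUNNO;
611  else if (maybe_fixed > maybe_prop * textord_words_veto_power)
612  block->pitch_decision = PITCH_MAYBE_FIXED;
613  else if (maybe_prop > maybe_fixed * textord_words_veto_power)
614  block->pitch_decision = PITCH_MAYBE_PROP;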
615  else
616  block->pitch_decision = PITCH_DUNNO;
617  return false;
618 }
void print_block_counts(TO_BLOCK *block, int32_t block_index)
Definition: topitch.cpp:627
float fixed_pitch
Definition: blobbox.h:664
bool fixed_pitch_row(TO_ROW *row, BLOCK *block, int32_t block_index)
Definition: topitch.cpp:984
float space_size
Definition: blobbox.h:680
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
float pr_nonsp
Definition: blobbox.h:668
EXTERN bool textord_blocksall_fixed
Definition: tovars.cpp:29
float xheight
Definition: blobbox.h:670
float pr_space
Definition: blobbox.h:667
EXTERN int textord_words_veto_power
Definition: tovars.cpp:65
EXTERN bool textord_blocksall_prop
Definition: tovars.cpp:31
PITCH_TYPE pitch_decision
Definition: blobbox.h:791
float kern_size
Definition: blobbox.h:679
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
BLOCK * block
Definition: blobbox.h:790
void count_block_votes(TO_BLOCK *block, int32_t &def_fixed, int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, int32_t &corr_prop, int32_t &dunno)
Definition: topitch.cpp:664
EXTERN bool textord_debug_pitch_test
Definition: topitch.cpp:42
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ tune_row_pitch()

float tune_row_pitch ( TO_ROW *  row,
STATS *  projection,
int16_t  projection_left,
int16_t  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
int16_t &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
bool  testing_on 
)

Definition at line 1156 of file topitch.cpp.

1167  {
1168  int pitch_delta; //offset pitch
1169  int16_t mid_cuts; //cheap cuts
1170  float pitch_sd; //current sd
1171  float best_sd; //best result
1172  float best_pitch; //pitch for best result
1173  float initial_sd; //starting error
1174  float sp_sd; //space sd
1175  ICOORDELT_LIST test_cells; //row cells
1176  ICOORDELT_IT best_it; //start of best list
1177 
1178  if (textord_fast_pitch_test)
1179  return tune_row_pitch2 (row, projection, projection_left,
1180  projection_right, space_size, initial_pitch,
1181  best_sp_sd,
1182  //space sd
1183  best_mid_cuts, best_cells, testing_on);
1184  if (textord_disable_pitch_test) {
1185  best_sp_sd = initial_pitch;
1186  return initial_pitch;
1187  }
1188  initial_sd =
1189  compute_pitch_sd(row,
1190  projection,
1191  projection_left,
1192  projection_right,
1193  space_size,
1194  initial_pitch,
1195  best_sp_sd,
1196  best_mid_cuts,
1197  best_cells,
1198  testing_on);
1199  best_sd = initial_sd;
1200  best_pitch = initial_pitch;
1201  if (testing_on)
1202  tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1203  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1204  pitch_sd =
1205  compute_pitch_sd (row, projection, projection_left, projection_right,
1206  space_size, initial_pitch + pitch_delta, sp_sd,
1207  mid_cuts, &test_cells, testing_on);
1208  if (testing_on)
1209  tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
1210  pitch_sd);
1211  if (pitch_sd < best_sd) {
1212  best_sd = pitch_sd;
1213  best_mid_cuts = mid_cuts;
1214  best_sp_sd = sp_sd;
1215  best_pitch = initial_pitch + pitch_delta;
1216  best_cells->clear ();
1217  best_it.set_to_list (best_cells);
1218  best_it.add_list_after (&test_cells);
1219  }
1220  else
1221  test_cells.clear ();
1222  if (pitch_sd > initial_sd)
1223  break; //getting worse
1224  }
1225  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1226  pitch_sd =
1227  compute_pitch_sd (row, projection, projection_left, projection_right,
1228  space_size, initial_pitch - pitch_delta, sp_sd,
1229  mid_cuts, &test_cells, testing_on);
1230  if (testing_on)
1231  tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
1232  pitch_sd);
1233  if (pitch_sd < best_sd) {
1234  best_sd = pitch_sd;
1235  best_mid_cuts = mid_cuts;
1236  best_sp_sd = sp_sd;
1237  best_pitch = initial_pitch - pitch_delta;
1238  best_cells->clear ();
1239  best_it.set_to_list (best_cells);
1240  best_it.add_list_after (&test_cells);
1241  }
1242  else
1243  test_cells.clear ();
1244  if (pitch_sd > initial_sd)
1245  break;
1246  }
1247  initial_pitch = best_pitch;
1248 
1249  if (textord_debug_pitch_metric)
1250  print_pitch_sd(row,
1251  projection,
1252  projection_left,
1253  projection_right,
1254  space_size,
1255  best_pitch);
1256 
1257  return best_sd;
1258 }
float tune_row_pitch2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1268
EXTERN bool textord_fast_pitch_test
Definition: topitch.cpp:46
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
EXTERN int textord_pitch_range
Definition: tovars.cpp:37
EXTERN bool textord_disable_pitch_test
Definition: topitch.cpp:44
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void print_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch)
Definition: topitch.cpp:1635
float compute_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch, float &sp_sd, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
Definition: topitch.cpp:1380

◆ tune_row_pitch2()

float tune_row_pitch2 ( TO_ROW *  row,
STATS *  projection,
int16_t  projection_left,
int16_t  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
int16_t &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
bool  testing_on 
)

Definition at line 1268 of file topitch.cpp.

1279  {
1280  int pitch_delta; //offset pitch
1281  int16_t pixel; //pixel coord
1282  int16_t best_pixel; //pixel coord
1283  int16_t best_delta; //best pitch
1284  int16_t best_pitch; //best pitch
1285  int16_t start; //of good range
1286  int16_t end; //of good range
1287  int32_t best_count; //lowest sum
1288  float best_sd; //best result
1289 
1290  best_sp_sd = initial_pitch;
1291 
1292  best_pitch = static_cast<int>(initial_pitch);
1293  if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
1294  return initial_pitch;
1295  }
1296  std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); //summed projection
1297 
1298  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1299  pitch_delta++)
1300  sum_proj[textord_pitch_range + pitch_delta].set_range (0,
1301  best_pitch +
1302  pitch_delta + 1);
1303  for (pixel = projection_left; pixel <= projection_right; pixel++) {
1304  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1305  pitch_delta++) {
1306  sum_proj[textord_pitch_range + pitch_delta].add(
1307  (pixel - projection_left) % (best_pitch + pitch_delta),
1308  projection->pile_count(pixel));
1309  }
1310  }
1311  best_count = sum_proj[textord_pitch_range].pile_count (0);
1312  best_delta = 0;
1313  best_pixel = 0;
1314  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
1315  pitch_delta++) {
1316  for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1317  if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
1318  < best_count) {
1319  best_count =
1320  sum_proj[textord_pitch_range +
1321  pitch_delta].pile_count (pixel);
1322  best_delta = pitch_delta;
1323  best_pixel = pixel;
1324  }
1325  }
1326  }
1327  if (testing_on)
1328  tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
1329  initial_pitch, best_delta, best_count);
1330  best_pitch += best_delta;
1331  initial_pitch = best_pitch;
1332  best_count++;
1333  best_count += best_count;
1334  for (start = best_pixel - 2; start > best_pixel - best_pitch
1335  && sum_proj[textord_pitch_range +
1336  best_delta].pile_count (start % best_pitch) <= best_count;
1337  start--);
1338  for (end = best_pixel + 2;
1339  end < best_pixel + best_pitch
1340  && sum_proj[textord_pitch_range +
1341  best_delta].pile_count (end % best_pitch) <= best_count;
1342  end++);
1343 
1344  best_sd =
1345  compute_pitch_sd(row,
1346  projection,
1347  projection_left,
1348  projection_right,
1349  space_size,
1350  initial_pitch,
1351  best_sp_sd,
1352  best_mid_cuts,
1353  best_cells,
1354  testing_on,
1355  start,
1356  end);
1357  if (testing_on)
1358  tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
1359  best_sd);
1360 
1361  if (textord_debug_pitch_metric)
1362  print_pitch_sd(row,
1363  projection,
1364  projection_left,
1365  projection_right,
1366  space_size,
1367  initial_pitch);
1368 
1369  return best_sd;
1370 }
int32_t pile_count(int32_t value) const
Definition: statistc.h:78
Definition: statistc.h:33
EXTERN bool textord_debug_pitch_metric
Definition: topitch.cpp:48
EXTERN int textord_pitch_range
Definition: tovars.cpp:37
EXTERN bool textord_disable_pitch_test
Definition: topitch.cpp:44
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void print_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch)
Definition: topitch.cpp:1635
float compute_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch, float &sp_sd, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
Definition: topitch.cpp:1380

Variable Documentation

◆ textord_all_prop

EXTERN bool textord_all_prop = FALSE

"All doc is proportional text"

Definition at line 40 of file topitch.cpp.

◆ textord_balance_factor

EXTERN double textord_balance_factor = 1.0

"Ding rate for unbalanced char cells"

Definition at line 57 of file topitch.cpp.

◆ textord_blockndoc_fixed

EXTERN bool textord_blockndoc_fixed = FALSE

"Attempt whole doc/block fixed pitch"

Definition at line 54 of file topitch.cpp.

◆ textord_debug_pitch_metric

EXTERN bool textord_debug_pitch_metric = FALSE

"Write full metric stuff"

Definition at line 48 of file topitch.cpp.

◆ textord_debug_pitch_test

EXTERN bool textord_debug_pitch_test = FALSE

"Debug on fixed pitch test"

Definition at line 42 of file topitch.cpp.

◆ textord_disable_pitch_test

EXTERN bool textord_disable_pitch_test = FALSE

"Turn off dp fixed pitch algorithm"

Definition at line 44 of file topitch.cpp.

◆ textord_fast_pitch_test

EXTERN bool textord_fast_pitch_test = FALSE

"Do even faster pitch algorithm"

Definition at line 46 of file topitch.cpp.

◆ textord_pitch_cheat

EXTERN bool textord_pitch_cheat = FALSE

"Use correct answer for fixed/prop"

Definition at line 52 of file topitch.cpp.

◆ textord_projection_scale

EXTERN double textord_projection_scale = 0.200

"Ding rate for mid-cuts"

Definition at line 55 of file topitch.cpp.

◆ textord_show_page_cuts

EXTERN bool textord_show_page_cuts = FALSE

"Draw page-level cuts"

Definition at line 50 of file topitch.cpp.

◆ textord_show_row_cuts

EXTERN bool textord_show_row_cuts = FALSE

"Draw row-level cuts"

Definition at line 49 of file topitch.cpp.