tesseract  4.0.0-1-g2a2b
tospace.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 /**********************************************************************
11  * tospace.cpp
12  *
13  * Compute fuzzy word spacing thresholds for each row.
14  * I.e. set : max_nonspace
15  * space_threshold
16  * min_space
17  * kern_size
18  * space_size
19  * for each row.
20  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21  *
22  * Note: functions in this file were originally not members of any
23  * class or enclosed by any namespace. Now they are all static members
24  * of the Textord class.
25  *
26  **********************************************************************/
27 
28 #include "drawtord.h"
29 #include "statistc.h"
30 #include "textord.h"
31 #include "tovars.h"
32 
33 // Include automatically generated configuration file if running autoconf.
34 #ifdef HAVE_CONFIG_H
35 #include "config_auto.h"
36 #endif
37 
38 #include <algorithm>
39 #include <memory>
40 
41 #define MAXSPACING 128 /*max expected spacing in pix */
42 
43 namespace tesseract {
45  ICOORD page_tr, //topright of page
46  TO_BLOCK_LIST *blocks //blocks on page
47  ) {
48  TO_BLOCK_IT block_it; //iterator
49  TO_BLOCK *block; //current block;
50  TO_ROW *row; //current row
51  int block_index; //block number
52  int row_index; //row number
53  //estimated width of real spaces for whole block
54  int16_t block_space_gap_width;
55  //estimated width of non space gaps for whole block
56  int16_t block_non_space_gap_width;
57  bool old_text_ord_proportional;//old fixed/prop result
58 
59  block_it.set_to_list (blocks);
60  block_index = 1;
61  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
62  block_it.forward ()) {
63  block = block_it.data ();
64  std::unique_ptr<GAPMAP> gapmap(new GAPMAP (block)); //map of big vert gaps in blk
65  block_spacing_stats(block,
66  gapmap.get(),
67  old_text_ord_proportional,
68  block_space_gap_width,
69  block_non_space_gap_width);
70  // Make sure relative values of block-level space and non-space gap
71  // widths are reasonable. The ratio of 1:3 is also used in
72  // block_spacing_stats, to correct the block_space_gap_width.
73  // Useful for arabic and hindi, when the non-space gap width is
74  // often over-estimated and should not be trusted. A similar ratio
75  // is found in block_spacing_stats.
77  (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
78  block_non_space_gap_width = (int16_t) floor (block_space_gap_width / 3.0);
79  }
80  // row iterator
81  TO_ROW_IT row_it(block->get_rows());
82  row_index = 1;
83  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
84  row = row_it.data ();
85  if ((row->pitch_decision == PITCH_DEF_PROP) ||
86  (row->pitch_decision == PITCH_CORR_PROP)) {
87  if ((tosp_debug_level > 0) && !old_text_ord_proportional)
88  tprintf ("Block %d Row %d: Now Proportional\n",
89  block_index, row_index);
90  row_spacing_stats(row,
91  gapmap.get(),
92  block_index,
93  row_index,
94  block_space_gap_width,
95  block_non_space_gap_width);
96  }
97  else {
98  if ((tosp_debug_level > 0) && old_text_ord_proportional)
99  tprintf
100  ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
101  block_index, row_index, row->pitch_decision,
102  row->fixed_pitch);
103  }
104 #ifndef GRAPHICS_DISABLED
106  plot_word_decisions (to_win, (int16_t) row->fixed_pitch, row);
107 #endif
108  row_index++;
109  }
110  block_index++;
111  }
112 }
113 
114 
115 /*************************************************************************
116  * block_spacing_stats()
117  *************************************************************************/
118 
// Estimates, for a whole block, the typical width of non-space (kern) gaps
// and of real spaces. Both results are written through reference parameters.
//
// Visible behaviour:
//  * Pass 1 walks the rows of `block` and, per row, the gaps between
//    consecutive blob boxes. Gaps not rejected by ignore_big_gap() are added
//    to all_gap_stats; the narrowest blob width seen is kept in minwidth.
//  * With <= 1 gap sample: block_non_space_gap_width = minwidth,
//    block_space_gap_width = -1 (no estimate) and
//    old_text_ord_proportional = true.
//  * Otherwise block_non_space_gap_width = floor(median of all gaps) and
//    pass 2 re-walks the rows, collecting gaps wider than
//    real_space_threshold into space_gap_stats. block_space_gap_width is -1
//    when that yields <= 2 samples, else
//    max(floor(median of those gaps), 3 * block_non_space_gap_width).
//
// Parameters:
//  block    - block whose rows are scanned (via block->get_rows()).
//  gapmap   - forwarded unchanged to ignore_big_gap().
//  old_text_ord_proportional - out: debug-only flag, see the IQR test below.
//  block_space_gap_width     - out: space estimate, or -1 if none.
//  block_non_space_gap_width - out: non-space gap estimate.
//
// NOTE(review): this listing appears to have lost several source lines (for
// example the `if` heads that precede each `else if (tosp_stats_use_xht_gaps)`
// and parts of some multi-line conditions). The comments here describe only
// what is visible.
119 void Textord::block_spacing_stats(
120  TO_BLOCK* block,
121  GAPMAP* gapmap,
122  bool& old_text_ord_proportional,
123  int16_t& block_space_gap_width, // resulting estimate
124  int16_t& block_non_space_gap_width // resulting estimate
125 ) {
126  TO_ROW *row; // current row
127  BLOBNBOX_IT blob_it; // iterator
128
129  STATS centre_to_centre_stats (0, MAXSPACING);
130  // DEBUG USE ONLY
131  STATS all_gap_stats (0, MAXSPACING);
132  STATS space_gap_stats (0, MAXSPACING);
133  int16_t minwidth = MAXSPACING; // narrowest blob
134  TBOX blob_box;
135  TBOX prev_blob_box;
136  int16_t centre_to_centre;
137  int16_t gap_width;
138  float real_space_threshold;
139  float iqr_centre_to_centre; // DEBUG USE ONLY
140  float iqr_all_gap_stats; // DEBUG USE ONLY
141  int32_t end_of_row;
142  int32_t row_length;
143
// ---- Pass 1: histogram every usable inter-blob gap in the block ----
144  // row iterator
145  TO_ROW_IT row_it(block->get_rows());
146  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
147  row = row_it.data ();
// Only non-empty rows are considered; the pitch_decision test below is
// partly missing from this listing (unbalanced parentheses).
148  if (!row->blob_list ()->empty () &&
150  (row->pitch_decision == PITCH_DEF_PROP) ||
151  (row->pitch_decision == PITCH_CORR_PROP))) {
152  blob_it.set_to_list (row->blob_list ());
153  blob_it.mark_cycle_pt ();
// Right edge of the blob one step behind the iterator start
// (presumably the last blob of the row - confirm against the list type).
154  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
// First blob of the row: fetched with one of three box helpers.
156  blob_box = box_next_pre_chopped (&blob_it);
157  else if (tosp_stats_use_xht_gaps)
158  blob_box = reduced_box_next (row, &blob_it);
159  else
160  blob_box = box_next (&blob_it);
// Span from the first blob's left edge to end_of_row.
161  row_length = end_of_row - blob_box.left ();
162  if (blob_box.width () < minwidth)
163  minwidth = blob_box.width ();
164  prev_blob_box = blob_box;
165  while (!blob_it.cycled_list ()) {
167  blob_box = box_next_pre_chopped (&blob_it);
168  else if (tosp_stats_use_xht_gaps)
169  blob_box = reduced_box_next (row, &blob_it);
170  else
171  blob_box = box_next (&blob_it);
172  if (blob_box.width () < minwidth)
173  minwidth = blob_box.width ();
// Gap = distance from the previous box's right edge to this box's left edge.
174  int16_t left = prev_blob_box.right();
175  int16_t right = blob_box.left();
176  gap_width = right - left;
177  if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
178  all_gap_stats.add (gap_width, 1);
179
// Distance between the horizontal centres of the two boxes.
180  centre_to_centre = (right + blob_box.right () -
181  (prev_blob_box.left () + left)) / 2;
182  //DEBUG
183  centre_to_centre_stats.add (centre_to_centre, 1);
184  // DEBUG
185  }
186  prev_blob_box = blob_box;
187  }
188  }
189  }
190
191  //Inadequate samples
192  if (all_gap_stats.get_total () <= 1) {
// Fall back to the narrowest blob width as the non-space estimate.
193  block_non_space_gap_width = minwidth;
194  block_space_gap_width = -1; //No est. space width
195  //DEBUG
196  old_text_ord_proportional = true;
197  }
198  else {
199  /* For debug only ..... */
// Compare the interquartile ranges of centre spacing and of raw gaps.
200  iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
201  centre_to_centre_stats.ile (0.25);
202  iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
203  old_text_ord_proportional =
204  iqr_centre_to_centre * 2 > iqr_all_gap_stats;
205  /* .......For debug only */
206
207  /*
208  The median of the gaps is used as an estimate of the NON-SPACE gap width.
209  This RELIES on the assumption that there are more gaps WITHIN words than
210  BETWEEN words in a block
211
212  Now try to estimate the width of a real space for all real spaces in the
213  block. Do this by using a crude threshold to ignore "narrow" gaps, then
214  find the median of the "wide" gaps and use this.
215  */
216  block_non_space_gap_width = (int16_t) floor (all_gap_stats.median ());
217  // median gap
218
// ---- Pass 2: collect only the gaps that look like real spaces ----
219  row_it.set_to_list (block->get_rows ());
220  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
221  row = row_it.data ();
222  if (!row->blob_list ()->empty () &&
224  (row->pitch_decision == PITCH_DEF_PROP) ||
225  (row->pitch_decision == PITCH_CORR_PROP))) {
// Crude per-row threshold; the second std::max operand is missing from
// this listing.
226  real_space_threshold =
227  std::max (tosp_init_guess_kn_mult * block_non_space_gap_width,
229  blob_it.set_to_list (row->blob_list ());
230  blob_it.mark_cycle_pt ();
231  end_of_row =
232  blob_it.data_relative (-1)->bounding_box ().right ();
234  blob_box = box_next_pre_chopped (&blob_it);
235  else if (tosp_stats_use_xht_gaps)
236  blob_box = reduced_box_next (row, &blob_it);
237  else
238  blob_box = box_next (&blob_it);
// NOTE(review): the operands are reversed relative to pass 1, which uses
// `end_of_row - blob_box.left ()`. As written this looks like it yields the
// negated row length - confirm whether that is intended.
239  row_length = blob_box.left () - end_of_row;
240  prev_blob_box = blob_box;
241  while (!blob_it.cycled_list ()) {
243  blob_box = box_next_pre_chopped (&blob_it);
244  else if (tosp_stats_use_xht_gaps)
245  blob_box = reduced_box_next (row, &blob_it);
246  else
247  blob_box = box_next (&blob_it);
248  int16_t left = prev_blob_box.right();
249  int16_t right = blob_box.left();
250  gap_width = right - left;
251  if ((gap_width > real_space_threshold) &&
252  !ignore_big_gap(row, row_length, gapmap, left, right)) {
253  /*
254  If tosp_use_cert_spaces is enabled, the estimate of the space gap is
255  restricted to obvious spaces - those wider than half the xht or those
256  with wide blobs on both sides - i.e not things that are suspect 1's or
257  punctuation that is sometimes widely spaced.
258  */
// The condition below is incomplete in this listing; the visible parts
// test gap_width and the narrow_blob()/wide_blob() status of both neighbours.
260  (gap_width >
262  ||
263  ((gap_width >
266  || (!narrow_blob (row, prev_blob_box)
267  && !narrow_blob (row, blob_box))))
268  || (wide_blob (row, prev_blob_box)
269  && wide_blob (row, blob_box)))
270  space_gap_stats.add (gap_width, 1);
271  }
272  prev_blob_box = blob_box;
273  }
274  }
275  }
276  //Inadequate samples
277  if (space_gap_stats.get_total () <= 2)
278  block_space_gap_width = -1;//No est. space width
279  else
// Never let the space estimate fall below 3x the non-space estimate.
280  block_space_gap_width =
281  std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
282  static_cast<int16_t>(3 * block_non_space_gap_width));
283  }
284 }
285 
286 
287 /*************************************************************************
288  * row_spacing_stats()
289  * Set values for min_space, max_non_space based on row stats only
290  * If failure - return 0 values.
291  *************************************************************************/
// Derives the spacing parameters of one row from its own gap histogram,
// using the block-level estimates as a starting point and fallback.
//
// Fields of `row` written in the visible code: space_size, kern_size,
// space_threshold, min_space and max_nonspace (some of them indirectly via
// old_to_method(), isolated_row_stats() and improve_row_threshold()).
//
// Parameters:
//  row       - row to analyse and update.
//  gapmap    - forwarded to ignore_big_gap() and isolated_row_stats().
//  block_idx, row_idx - used only in debug messages and forwarded to
//              isolated_row_stats().
//  block_space_gap_width     - block estimate of a space; a value <= 0 means
//              "no good estimate" and is replaced locally by
//              floor(row->xheight / 2).
//  block_non_space_gap_width - block estimate of a non-space gap.
//
// NOTE(review): this listing appears to have lost several source lines (for
// example the `if` heads that precede each `else if (tosp_stats_use_xht_gaps)`
// and parts of some multi-line conditions and expressions). The comments here
// describe only what is visible.
292 void Textord::row_spacing_stats(
293  TO_ROW *row,
294  GAPMAP *gapmap,
295  int16_t block_idx,
296  int16_t row_idx,
297  int16_t block_space_gap_width, //estimate for block
298  int16_t block_non_space_gap_width //estimate for block
299  ) {
300  //iterator
301  BLOBNBOX_IT blob_it = row->blob_list ();
302  STATS all_gap_stats (0, MAXSPACING);
303  STATS cert_space_gap_stats (0, MAXSPACING);
304  STATS all_space_gap_stats (0, MAXSPACING);
305  STATS small_gap_stats (0, MAXSPACING);
306  TBOX blob_box;
307  TBOX prev_blob_box;
308  int16_t gap_width;
309  int16_t real_space_threshold = 0;
310  int16_t max = 0;
311  int16_t index;
312  int16_t large_gap_count = 0;
313  bool suspected_table;
314  int32_t max_max_nonspace; //upper bound
// Remember whether the caller supplied a usable block space estimate before
// the parameter is possibly overwritten below.
315  bool good_block_space_estimate = block_space_gap_width > 0;
316  int32_t end_of_row;
317  int32_t row_length = 0;
318  float sane_space;
319  int32_t sane_threshold;
320
321  /* Collect first pass stats for row */
322
323  if (!good_block_space_estimate)
324  block_space_gap_width = int16_t (floor (row->xheight / 2));
325  if (!row->blob_list ()->empty ()) {
// Initial space/non-space threshold: either a biased interpolation between
// the two block estimates, or their midpoint.
326  if (tosp_threshold_bias1 > 0)
327  real_space_threshold =
328  block_non_space_gap_width +
329  int16_t (floor (0.5 +
330  tosp_threshold_bias1 * (block_space_gap_width -
331  block_non_space_gap_width)));
332  else
333  real_space_threshold = //Old TO method
334  (block_space_gap_width + block_non_space_gap_width) / 2;
335  blob_it.set_to_list (row->blob_list ());
336  blob_it.mark_cycle_pt ();
337  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
339  blob_box = box_next_pre_chopped (&blob_it);
340  else if (tosp_stats_use_xht_gaps)
341  blob_box = reduced_box_next (row, &blob_it);
342  else
343  blob_box = box_next (&blob_it);
344  row_length = end_of_row - blob_box.left ();
345  prev_blob_box = blob_box;
346  while (!blob_it.cycled_list ()) {
348  blob_box = box_next_pre_chopped (&blob_it);
349  else if (tosp_stats_use_xht_gaps)
350  blob_box = reduced_box_next (row, &blob_it);
351  else
352  blob_box = box_next (&blob_it);
353  int16_t left = prev_blob_box.right();
354  int16_t right = blob_box.left();
355  gap_width = right - left;
// Gaps rejected by ignore_big_gap() are only counted, not histogrammed.
356  if (ignore_big_gap(row, row_length, gapmap, left, right)) {
357  large_gap_count++;
358  } else {
// Gaps at or above the threshold go to all_space_gap_stats (and to
// cert_space_gap_stats when the partly-missing condition below holds);
// smaller gaps go to small_gap_stats. Every kept gap goes to all_gap_stats.
359  if (gap_width >= real_space_threshold) {
361  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
362  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
364  || (!narrow_blob (row, prev_blob_box)
365  && !narrow_blob (row, blob_box))))
366  || (wide_blob (row, prev_blob_box)
367  && wide_blob (row, blob_box)))
368  cert_space_gap_stats.add (gap_width, 1);
369  all_space_gap_stats.add (gap_width, 1);
370  }
371  else
372  small_gap_stats.add (gap_width, 1);
373  all_gap_stats.add (gap_width, 1);
374  }
375  prev_blob_box = blob_box;
376  }
377  }
// A row is treated as a possible table when more than one big gap was
// ignored, or when one was ignored and few gap samples remain.
378  suspected_table = (large_gap_count > 1) ||
379  ((large_gap_count > 0) &&
380  (all_gap_stats.get_total () <= tosp_few_samples));
381
382  /* Now determine row kern size, space size and threshold */
383
// Part of this condition is missing from the listing.
384  if ((cert_space_gap_stats.get_total () >=
386  ((suspected_table ||
387  all_gap_stats.get_total () <= tosp_short_row) &&
388  cert_space_gap_stats.get_total () > 0)) {
389  old_to_method(row,
390  &all_gap_stats,
391  &cert_space_gap_stats,
392  &small_gap_stats,
393  block_space_gap_width,
394  block_non_space_gap_width);
395  } else {
// Otherwise try isolated_row_stats(); the guard in front of that call and the
// guard for the debug message are missing from the listing.
397  !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
398  block_idx, row_idx)) {
400  tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
401  block_idx, row_idx);
402  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
403  //Use block default
404  row->space_size = block_space_gap_width;
405  if (all_gap_stats.get_total () > tosp_redo_kern_limit)
406  row->kern_size = all_gap_stats.median ();
407  else
408  row->kern_size = block_non_space_gap_width;
// The divisor of this expression is missing from the listing.
409  row->space_threshold =
410  int32_t (floor ((row->space_size + row->kern_size) /
412  }
413  else
414  old_to_method(row,
415  &all_gap_stats,
416  &all_space_gap_stats,
417  &small_gap_stats,
418  block_space_gap_width,
419  block_non_space_gap_width);
420  }
421  }
422
423  if (tosp_improve_thresh && !suspected_table)
424  improve_row_threshold(row, &all_gap_stats);
425
426  /* Now lets try to be careful not to do anything silly with tables when we
427  are ignoring big gaps*/
428  if (tosp_sanity_method == 0) {
// Method 0: for suspected tables only, force the threshold up to
// tosp_table_kn_sp_ratio * kern_size when the space estimate is below that.
429  if (suspected_table &&
430  (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
431  if (tosp_debug_level > 5)
432  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx,
433  row_idx, row->kern_size, row->space_threshold, row->space_size);
434  row->space_threshold =
435  (int32_t) (tosp_table_kn_sp_ratio * row->kern_size);
436  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
437  }
438  }
439  else if (tosp_sanity_method == 1) {
// Method 1: three successive checks - space vs kern, threshold vs kern, and
// an extra floor for suspected tables. kern_size is floored at 2.5 in the
// comparisons.
440  sane_space = row->space_size;
441  /* NEVER let space size get too close to kern size */
442  if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f))
443  || ((row->space_size - row->kern_size) <
444  (tosp_silly_kn_sp_gap * row->xheight))) {
445  if (good_block_space_estimate &&
446  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
447  sane_space = block_space_gap_width;
448  else
449  sane_space =
450  std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
451  row->xheight / 2.0f);
452  if (tosp_debug_level > 5)
453  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
454  block_idx, row_idx, row->kern_size, row->space_threshold,
455  row->space_size, sane_space);
456  row->space_size = sane_space;
// The divisor of this expression is missing from the listing.
457  row->space_threshold =
458  int32_t (floor ((row->space_size + row->kern_size) /
460  }
461  /* NEVER let threshold get VERY far away from kern */
462  sane_threshold = int32_t (floor (tosp_max_sane_kn_thresh *
463  std::max(row->kern_size, 2.5f)));
464  if (row->space_threshold > sane_threshold) {
465  if (tosp_debug_level > 5)
466  tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
467  block_idx, row_idx, row->kern_size, row->space_threshold,
468  row->space_size, sane_threshold);
469  row->space_threshold = sane_threshold;
// Keep space_size strictly above the clamped threshold.
470  if (row->space_size <= sane_threshold)
471  row->space_size = row->space_threshold + 1.0f;
472  }
473  /* Beware of tables - there may be NO spaces */
474  if (suspected_table) {
// The second std::max operand is missing from the listing.
475  sane_space = std::max(tosp_table_kn_sp_ratio * row->kern_size,
477  sane_threshold = int32_t (floor ((sane_space + row->kern_size) / 2));
478
479  if ((row->space_size < sane_space) ||
480  (row->space_threshold < sane_threshold)) {
481  if (tosp_debug_level > 5)
482  tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
483  block_idx, row_idx,
484  row->kern_size,
485  row->space_threshold, row->space_size);
486  //the minimum sane value
487  row->space_threshold = (int32_t) sane_space;
488  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
489  }
490  }
491  }
492
493  /* Now lets try to put some error limits on the threshold */
494
495  if (tosp_old_to_method) {
496  /* Old textord made a space if gap >= threshold */
497  //NO FUZZY SPACES YET
498  row->max_nonspace = row->space_threshold;
499  //NO FUZZY SPACES YET
500  row->min_space = row->space_threshold + 1;
501  }
502  else {
503  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
504  row->min_space =
505  std::min(int32_t (ceil (tosp_fuzzy_space_factor * row->xheight)),
506  int32_t (row->space_size));
507  if (row->min_space <= row->space_threshold)
508  // Don't be silly
509  row->min_space = row->space_threshold + 1;
510  /*
511  Lets try to guess the max certain kern gap by looking at the cluster of
512  kerns for the row. The row is proportional so the kerns should cluster
513  tightly at the bottom of the distribution. We also expect most gaps to be
514  kerns. Find the maximum of the kern piles between 0 and twice the kern
515  estimate. Piles before the first one with less than 1/10 the maximum
516  number of samples can be taken as certain kerns.
517
518  Of course, there are some cases where the kern peak and space peaks merge,
519  so we will put an UPPER limit on the max certain kern gap of some fraction
520  below the threshold.
521  */
522
523  max_max_nonspace = int32_t ((row->space_threshold + row->kern_size) / 2);
524
525  //default
526  row->max_nonspace = max_max_nonspace;
// Scan piles upward tracking the tallest so far; stop at the first pile above
// kern_size whose count is below a tenth of that running maximum.
527  for (index = 0; index <= max_max_nonspace; index++) {
528  if (all_gap_stats.pile_count (index) > max)
529  max = all_gap_stats.pile_count (index);
530  if ((index > row->kern_size) &&
531  (all_gap_stats.pile_count (index) < 0.1 * max)) {
532  row->max_nonspace = index;
533  break;
534  }
535  }
536  }
537
538  /* Yet another algorithm - simpler this time - just choose a fraction of the
539  threshold to space range */
540
// The multiplier inside the ceil() expression is missing from the listing.
541  if ((tosp_fuzzy_sp_fraction > 0) &&
542  (row->space_size > row->space_threshold))
543  row->min_space = std::max(row->min_space,
544  (int32_t) ceil (row->space_threshold +
546  (row->space_size -
547  row->space_threshold)));
548
549  /* Ensure that ANY space less than some multiplier times the kern size is
550  fuzzy. In tables there is a risk of erroneously setting a small space size
551  when there are no real spaces. Sometimes tables have text squashed into
552  columns so that the kn->sp ratio is small anyway - this means that we can't
553  use this to force a wider separation - hence we rely on context to join any
554  dubious breaks. */
555
556  if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
557  (suspected_table || tosp_fuzzy_limit_all))
558  row->min_space = std::max(row->min_space,
559  (int32_t) ceil (tosp_table_fuzzy_kn_sp_ratio *
560  row->kern_size));
561
// The multiplier inside the floor() expression is missing from the listing.
562  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
563  row->max_nonspace = (int32_t) floor (0.5 + row->kern_size +
565  (row->space_threshold -
566  row->kern_size));
567  }
568  if (row->max_nonspace > row->space_threshold) {
569  // Don't be silly
570  row->max_nonspace = row->space_threshold;
571  }
572
// Debug dump: block-level inputs first, then the per-row results.
573  if (tosp_debug_level > 5)
574  tprintf
575  ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
576  block_idx, row_idx, row_length, block_non_space_gap_width,
577  block_space_gap_width, real_space_threshold, row->kern_size,
578  row->max_nonspace, row->space_threshold, row->min_space,
579  row->space_size);
580  if (tosp_debug_level > 10)
581  tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
582  "row->space_threshold = %d\n",
583  row->kern_size, row->space_size, row->space_threshold);
584 }
585 
// Sets row->space_size, row->kern_size and row->space_threshold from the
// supplied gap histograms, limited by the block-level estimates.
//
// Parameters:
//  row              - row whose three spacing fields are written.
//  all_gap_stats    - histogram of all gaps; fallback source for kern_size.
//  space_gap_stats  - histogram of gaps believed to be spaces.
//  small_gap_stats  - histogram of gaps believed to be kerns.
//  block_space_gap_width     - block space estimate; used as an upper
//                     reference and as the default when there are no samples.
//  block_non_space_gap_width - block non-space estimate; used for the lower
//                     bound on space_size and as the default kern_size.
//
// NOTE(review): this listing appears to have lost several source lines (the
// `if` heads before the `else` branches marked "BUG???", part of the kern-size
// condition, and parts of the threshold and final sanity expressions). The
// comments here describe only what is visible.
586 void Textord::old_to_method(
587  TO_ROW *row,
588  STATS *all_gap_stats,
589  STATS *space_gap_stats,
590  STATS *small_gap_stats,
591  int16_t block_space_gap_width, //estimate for block
592  int16_t block_non_space_gap_width //estimate for block
593  ) {
594  /* First, estimate row space size */
595  /* Old to condition was > 2 */
596  if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
597  //Adequate samples
598  /* Set space size to median of spaces BUT limits it if it seems wildly out */
599  row->space_size = space_gap_stats->median ();
600  if (row->space_size > block_space_gap_width * 1.5) {
// The `if` selecting between the two assignments below is missing from the
// listing.
602  row->space_size = block_space_gap_width * 1.5;
603  else
604  //BUG??? should be *1.5
605  row->space_size = block_space_gap_width;
606  }
// Lower bound: just over twice the block non-space gap.
607  if (row->space_size < (block_non_space_gap_width * 2) + 1)
608  row->space_size = (block_non_space_gap_width * 2) + 1;
609  }
610  //Only 1 or 2 samples
611  else if (space_gap_stats->get_total () >= 1) {
612  //hence mean not median
613  row->space_size = space_gap_stats->mean ();
614  if (row->space_size > block_space_gap_width * 1.5) {
// The `if` selecting between the two assignments below is missing from the
// listing.
616  row->space_size = block_space_gap_width * 1.5;
617  else
618  //BUG??? should be *1.5
619  row->space_size = block_space_gap_width;
620  }
// Lower bound is stricter here (3x rather than 2x) than in the median case.
621  if (row->space_size < (block_non_space_gap_width * 3) + 1)
622  row->space_size = (block_non_space_gap_width * 3) + 1;
623  }
624  else {
625  //Use block default
626  row->space_size = block_space_gap_width;
627  }
628
629  /* Next, estimate row kern size */
// Preference order visible here: median of small gaps, then median of all
// gaps, then the block estimate. The head of the first condition is missing.
631  (small_gap_stats->get_total () > tosp_redo_kern_limit))
632  row->kern_size = small_gap_stats->median ();
633  else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
634  row->kern_size = all_gap_stats->median ();
635  else //old TO -SAME FOR ALL ROWS
636  row->kern_size = block_non_space_gap_width;
637
638  /* Finally, estimate row space threshold */
639  if (tosp_threshold_bias2 > 0) {
// Part of this expression is missing from the listing.
640  row->space_threshold =
641  int32_t (floor (0.5 + row->kern_size +
643  row->kern_size)));
644  } else {
645  /*
646  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
647  and holds this in a float. The use is with a >= test
648  NEW textord uses an integer threshold and a > test
649  It comes to the same thing.
650  (Though there is a difference in that old textor has integer space_size
651  and kern_size.)
652  */
653  row->space_threshold =
654  int32_t (floor ((row->space_size + row->kern_size) / 2));
655  }
656
657  // Apply the same logic and ratios as in row_spacing_stats to
658  // restrict relative values of the row's space_size, kern_size, and
659  // space_threshold
// The head of this condition, the statement guarded by `kern_size > 2.5`, and
// the divisor of the threshold expression are missing from the listing.
661  ((row->space_size <
662  tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
663  ((row->space_size - row->kern_size) <
664  tosp_silly_kn_sp_gap * row->xheight))) {
665  if (row->kern_size > 2.5)
667  row->space_threshold = int32_t (floor ((row->space_size + row->kern_size) /
669  }
670 }
671 
672 
673 /*************************************************************************
674  * isolated_row_stats()
675  * Estimate space_size, kern_size and space_threshold from this row's gaps only
676  *************************************************************************/
// Tries to estimate row->space_size, row->kern_size and row->space_threshold
// from this row's own gaps, without using any block-level estimate.
//
// Returns true when the three values were set and pass the sanity check.
// Returns false (a) early, leaving `row` untouched, when there are too few
// samples or too few small gaps, or (b) after resetting the three fields to
// zero when the sanity check at the end fails.
//
// Parameters:
//  row             - row to analyse and update.
//  gapmap          - forwarded to ignore_big_gap().
//  all_gap_stats   - histogram of the row's gaps built by the caller; its
//                    median is the initial kern estimate.
//  suspected_table - allows the mean of a few certain spaces to be used.
//  block_idx, row_idx - used only in debug messages.
//
// NOTE(review): this listing appears to have lost several source lines (the
// `if` heads before each `else if (tosp_stats_use_xht_gaps)`, the second
// std::max operand, and parts of several conditions). The comments here
// describe only what is visible.
677 bool Textord::isolated_row_stats(TO_ROW* row,
678  GAPMAP* gapmap,
679  STATS* all_gap_stats,
680  bool suspected_table,
681  int16_t block_idx,
682  int16_t row_idx) {
683  float kern_estimate;
684  float crude_threshold_estimate;
685  int16_t small_gaps_count;
686  int16_t total;
687  //iterator
688  BLOBNBOX_IT blob_it = row->blob_list ();
689  STATS cert_space_gap_stats (0, MAXSPACING);
690  STATS all_space_gap_stats (0, MAXSPACING);
691  STATS small_gap_stats (0, MAXSPACING);
692  TBOX blob_box;
693  TBOX prev_blob_box;
694  int16_t gap_width;
695  int32_t end_of_row;
696  int32_t row_length;
697
// Start from the median gap as the kern size and scale it up to get a crude
// kern/space threshold (second std::max operand missing from the listing).
698  kern_estimate = all_gap_stats->median ();
699  crude_threshold_estimate = std::max(tosp_init_guess_kn_mult * kern_estimate,
// Number of samples in piles below ceil(crude_threshold_estimate).
701  small_gaps_count = stats_count_under (all_gap_stats,
702  (int16_t)
703  ceil (crude_threshold_estimate));
704  total = all_gap_stats->get_total ();
705
// Give up when there are too few samples, too small a fraction of them are
// small gaps, or no gap reaches the crude threshold.
706  if ((total <= tosp_redo_kern_limit) ||
707  ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
708  (total - small_gaps_count < 1)) {
709  if (tosp_debug_level > 5)
710  tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx,
711  row_idx);
712  return false;
713  }
714  blob_it.set_to_list (row->blob_list ());
715  blob_it.mark_cycle_pt ();
716  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
718  blob_box = box_next_pre_chopped (&blob_it);
719  else if (tosp_stats_use_xht_gaps)
720  blob_box = reduced_box_next (row, &blob_it);
721  else
722  blob_box = box_next (&blob_it);
723  row_length = end_of_row - blob_box.left ();
724  prev_blob_box = blob_box;
725  while (!blob_it.cycled_list ()) {
727  blob_box = box_next_pre_chopped (&blob_it);
728  else if (tosp_stats_use_xht_gaps)
729  blob_box = reduced_box_next (row, &blob_it);
730  else
731  blob_box = box_next (&blob_it);
732  int16_t left = prev_blob_box.right();
733  int16_t right = blob_box.left();
734  gap_width = right - left;
// Wide gaps not rejected by ignore_big_gap() are recorded as spaces, and as
// "certain" spaces when the xheight / blob-width test below also holds.
735  if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
736  (gap_width > crude_threshold_estimate)) {
737  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
738  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
740  (!narrow_blob (row, prev_blob_box) &&
741  !narrow_blob (row, blob_box)))) ||
742  (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
743  cert_space_gap_stats.add (gap_width, 1);
744  all_space_gap_stats.add (gap_width, 1);
745  }
// Strictly-smaller gaps are kerns. A gap exactly equal to the crude threshold
// is recorded in neither histogram.
746  if (gap_width < crude_threshold_estimate)
747  small_gap_stats.add (gap_width, 1);
748
749  prev_blob_box = blob_box;
750  }
// Choose space_size: prefer certain spaces, then all spaces; median when
// there are enough samples (right-hand sides of the two `>=` tests are
// missing from the listing), otherwise mean.
751  if (cert_space_gap_stats.get_total () >=
753  //median
754  row->space_size = cert_space_gap_stats.median ();
755  else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
756  //to avoid spaced
757  row->space_size = cert_space_gap_stats.mean ();
758  // 1's in tables
759  else if (all_space_gap_stats.get_total () >=
761  //median
762  row->space_size = all_space_gap_stats.median ();
763  else
764  row->space_size = all_space_gap_stats.mean ();
765
// Choose kern_size (the guarding `if` is missing from the listing).
767  row->kern_size = small_gap_stats.median ();
768  else
769  row->kern_size = all_gap_stats->median ();
// Threshold is the floored midpoint of the two estimates.
770  row->space_threshold =
771  int32_t (floor ((row->space_size + row->kern_size) / 2));
772  /* Sanity check */
// Require kern_size < space_threshold < space_size and a positive threshold;
// otherwise zero the results and report failure.
773  if ((row->kern_size >= row->space_threshold) ||
774  (row->space_threshold >= row->space_size) ||
775  (row->space_threshold <= 0)) {
776  if (tosp_debug_level > 5)
777  tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
778  block_idx, row_idx,
779  row->kern_size, row->space_threshold, row->space_size);
780  row->kern_size = 0.0f;
781  row->space_threshold = 0;
782  row->space_size = 0.0f;
783  return false;
784  }
785
786  if (tosp_debug_level > 5)
787  tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
788  block_idx, row_idx,
789  row->kern_size, row->space_threshold, row->space_size);
790  return true;
791 }
792 
793 int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
794  int16_t index;
795  int16_t total = 0;
796 
797  for (index = 0; index < threshold; index++)
798  total += stats->pile_count (index);
799  return total;
800 }
801 
802 
803 /*************************************************************************
804  * improve_row_threshold()
805  * Try to recognise a "normal line" -
806  * > 25 gaps
807  * && space > 3 * kn && space > 10
808  * (I.e. reasonably large space and kn:sp ratio)
809  * && > 3/4 # gaps < kn + (sp - kn)/3
810  * (I.e. most gaps are well away from space estimate)
811  * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
812  * somewhere in the histogram between kn and sp
813  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
814  * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
815  * try moving the default threshold to within this band but leave the
816  * fuzzy limit calculation as at present.
817  *************************************************************************/
818 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
819  float sp = row->space_size;
820  float kn = row->kern_size;
821  int16_t reqd_zero_width = 0;
822  int16_t zero_width = 0;
823  int16_t zero_start = 0;
824  int16_t index = 0;
825 
826  if (tosp_debug_level > 10)
827  tprintf ("Improve row threshold 0");
828  if ((all_gap_stats->get_total () <= 25) ||
829  (sp <= 10) ||
830  (sp <= 3 * kn) ||
831  (stats_count_under (all_gap_stats,
832  (int16_t) ceil (kn + (sp - kn) / 3 + 0.5)) <
833  (0.75 * all_gap_stats->get_total ())))
834  return;
835  if (tosp_debug_level > 10)
836  tprintf (" 1");
837  /*
838  Look for the first region of all 0's in the histogram which is wider than
839  max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
840  threshold is not within it, move the threshold so that is is just inside it.
841  */
842  reqd_zero_width = (int16_t) floor ((sp - kn) / 3 + 0.5);
843  if (reqd_zero_width < 3)
844  reqd_zero_width = 3;
845 
846  for (index = int16_t (ceil (kn)); index < int16_t (floor (sp)); index++) {
847  if (all_gap_stats->pile_count (index) == 0) {
848  if (zero_width == 0)
849  zero_start = index;
850  zero_width++;
851  }
852  else {
853  if (zero_width >= reqd_zero_width)
854  break;
855  else {
856  zero_width = 0;
857  }
858  }
859  }
860  index--;
861  if (tosp_debug_level > 10)
862  tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
863  reqd_zero_width, zero_width, zero_start, row->space_threshold);
864  if ((zero_width < reqd_zero_width) ||
865  ((row->space_threshold >= zero_start) &&
866  (row->space_threshold <= index)))
867  return;
868  if (tosp_debug_level > 10)
869  tprintf (" 2");
870  if (row->space_threshold < zero_start) {
871  if (tosp_debug_level > 5)
872  tprintf
873  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
874  kn, sp, zero_start, index, row->space_threshold, zero_start);
875  row->space_threshold = zero_start;
876  }
877  if (row->space_threshold > index) {
878  if (tosp_debug_level > 5)
879  tprintf
880  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
881  kn, sp, zero_start, index, row->space_threshold, index);
882  row->space_threshold = index;
883  }
884 }
885 
886 
887 /**********************************************************************
888  * make_prop_words
889  *
890  * Convert a TO_BLOCK to a BLOCK.
891  **********************************************************************/
893  TO_ROW *row, // row to make
894  FCOORD rotation // for drawing
895  ) {
      // NOTE(review): the line that carries the return type and function name
      // (listing line 892) is absent from this listing; the header comment and
      // the `return real_row` below suggest `ROW *Textord::make_prop_words(` --
      // confirm against the original file.
      //
      // Walks the blobs of `row`, gathering their C_BLOBs into `cblobs` and
      // closing a WERD each time a word break is decided. Words taken from
      // row->rep_words are spliced in wherever the next blob starts to the
      // right of the pending repeated-char word's right edge. Returns a newly
      // allocated ROW holding the words, or nullptr when the row's blob list
      // is empty. `rotation` is not referenced anywhere in the body.
896  bool bol; // start of line
897  /* prev_ values are for start of word being built. non prev_ values are for
898  the gap between the word being built and the next one. */
899  bool prev_fuzzy_sp; // probably space
900  bool prev_fuzzy_non; // probably not
901  uint8_t prev_blanks; // in front of word
902  bool fuzzy_sp = false; // probably space
903  bool fuzzy_non = false; // probably not
904  uint8_t blanks = 0; // in front of word
905  bool prev_gap_was_a_space = false;
906  bool break_at_next_gap = false;
907  ROW *real_row; // output row
908  C_OUTLINE_IT cout_it;
909  C_BLOB_LIST cblobs;
910  C_BLOB_IT cblob_it = &cblobs;
911  WERD_LIST words;
912  WERD *word; // new word
      // Right edge of the next pending repeated-char word; INT32_MAX means
      // there is none left to insert.
913  int32_t next_rep_char_word_right = INT32_MAX;
914  float repetition_spacing; // gap between repetitions
915  int32_t xstarts[2]; // row ends
916  int32_t prev_x; // end of prev blob
917  BLOBNBOX *bblob; // current blob
918  TBOX blob_box; // bounding box
919  BLOBNBOX_IT box_it; // iterator
920  TBOX prev_blob_box;
921  TBOX next_blob_box;
      // Three-gap window around the gap currently being classified. INT16_MAX
      // is the initial "no gap" value.
922  int16_t prev_gap = INT16_MAX;
923  int16_t current_gap = INT16_MAX;
924  int16_t next_gap = INT16_MAX;
925  int16_t prev_within_xht_gap = INT16_MAX;
926  int16_t current_within_xht_gap = INT16_MAX;
927  int16_t next_within_xht_gap = INT16_MAX;
928  int16_t word_count = 0;
929
930  // repeated char words
931  WERD_IT rep_char_it(&(row->rep_words));
932  if (!rep_char_it.empty ()) {
933  next_rep_char_word_right =
934  rep_char_it.data ()->bounding_box ().right ();
935  }
936
937  prev_x = -INT16_MAX;
938  cblob_it.set_to_list (&cblobs);
939  box_it.set_to_list (row->blob_list ());
940  // new words
941  WERD_IT word_it(&words);
942  bol = true;
943  prev_blanks = 0;
944  prev_fuzzy_sp = false;
945  prev_fuzzy_non = false;
946  if (!box_it.empty ()) {
947  xstarts[0] = box_it.data ()->bounding_box ().left ();
948  if (xstarts[0] > next_rep_char_word_right) {
949  /* We need to insert a repeated char word at the start of the row */
950  word = rep_char_it.extract ();
951  word_it.add_after_then_move (word);
952  /* Set spaces before repeated char word */
953  word->set_flag (W_BOL, true);
954  bol = false;
955  word->set_blanks (0);
956  //NO uncertainty
957  word->set_flag (W_FUZZY_SP, false);
958  word->set_flag (W_FUZZY_NON, false);
959  xstarts[0] = word->bounding_box ().left ();
960  /* Set spaces after repeated char word (and leave current word set) */
961  repetition_spacing = find_mean_blob_spacing (word);
962  current_gap = box_it.data ()->bounding_box ().left () -
963  next_rep_char_word_right;
964  current_within_xht_gap = current_gap;
      // The gap after the repeated-char word only counts as blanks when it
      // exceeds tosp_rep_space times the mean spacing inside that word.
965  if (current_gap > tosp_rep_space * repetition_spacing) {
966  prev_blanks = (uint8_t) floor (current_gap / row->space_size);
967  if (prev_blanks < 1)
968  prev_blanks = 1;
969  }
970  else
971  prev_blanks = 0;
972  if (tosp_debug_level > 5)
973  tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
974  box_it.data ()->bounding_box ().left (),
975  box_it.data ()->bounding_box ().bottom (),
976  repetition_spacing, current_gap);
977  prev_fuzzy_sp = false;
978  prev_fuzzy_non = false;
979  if (rep_char_it.empty ()) {
980  next_rep_char_word_right = INT32_MAX;
981  }
982  else {
983  rep_char_it.forward ();
984  next_rep_char_word_right =
985  rep_char_it.data ()->bounding_box ().right ();
986  }
987  }
988
989  peek_at_next_gap(row,
990  box_it,
991  next_blob_box,
992  next_gap,
993  next_within_xht_gap);
994  do {
995  bblob = box_it.data ();
996  blob_box = bblob->bounding_box ();
997  if (bblob->joined_to_prev ()) {
      // Continuation of the previous blob: append its outlines after the last
      // outline of the C_BLOB that cblob_it currently points at, then delete
      // the source C_BLOB.
998  if (bblob->cblob () != nullptr) {
999  cout_it.set_to_list (cblob_it.data ()->out_list ());
1000  cout_it.move_to_last ();
1001  cout_it.add_list_after (bblob->cblob ()->out_list ());
1002  delete bblob->cblob ();
1003  }
1004  } else {
1005  if (bblob->cblob() != nullptr)
1006  cblob_it.add_after_then_move (bblob->cblob ());
1007  prev_x = blob_box.right ();
1008  }
1009  box_it.forward (); //next one
1010  bblob = box_it.data ();
1011  blob_box = bblob->bounding_box ();
1012
1013  if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
1014  /* Real Blob - not multiple outlines or pre-chopped */
      // Slide the prev/current/next gap window forward by one blob.
1015  prev_gap = current_gap;
1016  prev_within_xht_gap = current_within_xht_gap;
1017  prev_blob_box = next_blob_box;
1018  current_gap = next_gap;
1019  current_within_xht_gap = next_within_xht_gap;
1020  peek_at_next_gap(row,
1021  box_it,
1022  next_blob_box,
1023  next_gap,
1024  next_within_xht_gap);
1025
1026  int16_t prev_gap_arg = prev_gap;
1027  int16_t next_gap_arg = next_gap;
1028  if (tosp_only_use_xht_gaps) {
1029  prev_gap_arg = prev_within_xht_gap;
1030  next_gap_arg = next_within_xht_gap;
1031  }
1032  // Decide if a word-break should be inserted
      // A word is closed when the new blob starts beyond the pending
      // repeated-char word, when make_a_word_break says so, or when the
      // iterator is back at the first blob. Because of || short-circuiting,
      // make_a_word_break is not called when the first test already holds.
1033  if (blob_box.left () > next_rep_char_word_right ||
1034  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1035  current_gap, current_within_xht_gap,
1036  next_blob_box, next_gap_arg,
1037  blanks, fuzzy_sp, fuzzy_non,
1038  prev_gap_was_a_space,
1039  break_at_next_gap) ||
1040  box_it.at_first()) {
1041  /* Form a new word out of the blobs collected */
1042  word = new WERD (&cblobs, prev_blanks, nullptr);
1043  word_count++;
1044  word_it.add_after_then_move (word);
1045  if (bol) {
1046  word->set_flag (W_BOL, true);
1047  bol = false;
1048  }
1049  if (prev_fuzzy_sp)
1050  //probably space
1051  word->set_flag (W_FUZZY_SP, true);
1052  else if (prev_fuzzy_non)
1053  word->set_flag (W_FUZZY_NON, true);
1054  //probably not
1055
1056  if (blob_box.left () > next_rep_char_word_right) {
1057  /* We need to insert a repeated char word */
1058  word = rep_char_it.extract ();
1059  word_it.add_after_then_move (word);
1060
1061  /* Set spaces before repeated char word */
1062  repetition_spacing = find_mean_blob_spacing (word);
1063  current_gap = word->bounding_box ().left () - prev_x;
1064  current_within_xht_gap = current_gap;
1065  if (current_gap > tosp_rep_space * repetition_spacing) {
1066  blanks =
1067  (uint8_t) floor (current_gap / row->space_size);
1068  if (blanks < 1)
1069  blanks = 1;
1070  }
1071  else
1072  blanks = 0;
1073  if (tosp_debug_level > 5)
1074  tprintf
1075  ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1076  word->bounding_box ().left (),
1077  word->bounding_box ().bottom (),
1078  repetition_spacing, current_gap, blanks);
1079  word->set_blanks (blanks);
1080  //NO uncertainty
1081  word->set_flag (W_FUZZY_SP, false);
1082  word->set_flag (W_FUZZY_NON, false);
1083
1084  /* Set spaces after repeated char word (and leave current word set) */
1085  current_gap =
1086  blob_box.left () - next_rep_char_word_right;
1087  if (current_gap > tosp_rep_space * repetition_spacing) {
1088  blanks = (uint8_t) (current_gap / row->space_size);
1089  if (blanks < 1)
1090  blanks = 1;
1091  }
1092  else
1093  blanks = 0;
1094  if (tosp_debug_level > 5)
1095  tprintf (" Rgap:%d (%d blanks)\n",
1096  current_gap, blanks);
1097  fuzzy_sp = FALSE;
1098  fuzzy_non = FALSE;
1099
1100  if (rep_char_it.empty ()) {
1101  next_rep_char_word_right = INT32_MAX;
1102  }
1103  else {
1104  rep_char_it.forward ();
1105  next_rep_char_word_right =
1106  rep_char_it.data ()->bounding_box ().right ();
1107  }
1108  }
1109
1110  if (box_it.at_first () && rep_char_it.empty ()) {
1111  //at end of line
1112  word->set_flag (W_EOL, true);
1113  xstarts[1] = prev_x;
1114  }
1115  else {
      // Carry this gap's verdict forward: it describes the space in front of
      // the word that will be built next.
1116  prev_blanks = blanks;
1117  prev_fuzzy_sp = fuzzy_sp;
1118  prev_fuzzy_non = fuzzy_non;
1119  }
1120  }
1121  }
1122  }
1123  while (!box_it.at_first ()); //until back at start
1124
1125  /* Insert any further repeated char words */
1126  while (!rep_char_it.empty ()) {
1127  word = rep_char_it.extract ();
1128  word_it.add_after_then_move (word);
1129
1130  /* Set spaces before repeated char word */
1131  repetition_spacing = find_mean_blob_spacing (word);
1132  current_gap = word->bounding_box ().left () - prev_x;
1133  if (current_gap > tosp_rep_space * repetition_spacing) {
1134  blanks = (uint8_t) floor (current_gap / row->space_size);
1135  if (blanks < 1)
1136  blanks = 1;
1137  }
1138  else
1139  blanks = 0;
1140  if (tosp_debug_level > 5)
1141  tprintf(
1142  "Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1143  word->bounding_box().left(), word->bounding_box().bottom(),
1144  repetition_spacing, current_gap, blanks);
1145  word->set_blanks (blanks);
1146  //NO uncertainty
1147  word->set_flag (W_FUZZY_SP, false);
1148  word->set_flag (W_FUZZY_NON, false);
1149  prev_x = word->bounding_box ().right ();
1150  if (rep_char_it.empty ()) {
1151  //at end of line
1152  word->set_flag (W_EOL, true);
1153  xstarts[1] = prev_x;
1154  }
1155  else {
1156  rep_char_it.forward ();
1157  }
1158  }
      // Create the output row from the TO_ROW and its kern/space sizes
      // (truncated to int16_t), then hand the accumulated words to it.
1159  real_row = new ROW (row,
1160  (int16_t) row->kern_size, (int16_t) row->space_size);
1161  word_it.set_to_list (real_row->word_list ());
1162  //put words in row
1163  word_it.add_list_after (&words);
1164  real_row->recalc_bounding_box ();
1165
1166  if (tosp_debug_level > 4) {
1167  tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1168  word_count,
1169  real_row->bounding_box ().left (),
1170  real_row->bounding_box ().bottom (),
1171  real_row->bounding_box ().right (),
1172  real_row->bounding_box ().top ());
1173  }
1174  return real_row;
1175  }
      // The row had no blobs at all, so no ROW is created.
1176  return nullptr;
1177 }
1178 
1179 /**********************************************************************
1180  * make_blob_words
1181  *
1182  * Converts words into blobs so that each blob is a single character.
1183  * Used for chopper test.
1184  **********************************************************************/
1186  TO_ROW *row, // row to make
1187  FCOORD rotation // for drawing
1188  ) {
      // NOTE(review): the line that carries the return type and function name
      // (listing line 1185) is absent from this listing; the header comment and
      // the `return real_row` below suggest `ROW *Textord::make_blob_words(` --
      // confirm against the original file.
      //
      // Builds one WERD per blob group: blobs flagged joined_to_prev are merged
      // into the preceding C_BLOB, and a word (with 1 blank) is closed as soon
      // as the next blob is not joined_to_prev and cblobs is non-empty.
      // Returns a newly allocated ROW, or nullptr when the row has no blobs.
      // `rotation` is not referenced in the body, and blob_box is assigned but
      // never read.
1189  bool bol; // start of line
1190  ROW *real_row; // output row
1191  C_OUTLINE_IT cout_it;
1192  C_BLOB_LIST cblobs;
1193  C_BLOB_IT cblob_it = &cblobs;
1194  WERD_LIST words;
1195  WERD *word; // new word
1196  BLOBNBOX *bblob; // current blob
1197  TBOX blob_box; // bounding box
1198  BLOBNBOX_IT box_it; // iterator
1199  int16_t word_count = 0;
1200
1201  cblob_it.set_to_list(&cblobs);
1202  box_it.set_to_list(row->blob_list());
1203  // new words
1204  WERD_IT word_it(&words);
1205  bol = TRUE;
1206  if (!box_it.empty()) {
1207
1208  do {
1209  bblob = box_it.data();
1210  blob_box = bblob->bounding_box();
1211  if (bblob->joined_to_prev()) {
      // Append this blob's outlines after the last outline of the C_BLOB that
      // cblob_it currently points at, then delete the source C_BLOB.
1212  if (bblob->cblob() != nullptr) {
1213  cout_it.set_to_list(cblob_it.data()->out_list());
1214  cout_it.move_to_last();
1215  cout_it.add_list_after(bblob->cblob()->out_list());
1216  delete bblob->cblob();
1217  }
1218  } else {
1219  if (bblob->cblob() != nullptr)
1220  cblob_it.add_after_then_move(bblob->cblob());
1221  }
1222  box_it.forward(); // next one
1223  bblob = box_it.data();
1224  blob_box = bblob->bounding_box();
1225
      // The blob now under the iterator starts a new group, so whatever has
      // been collected so far becomes a word of its own.
1226  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1227  word = new WERD(&cblobs, 1, nullptr);
1228  word_count++;
1229  word_it.add_after_then_move(word);
1230  if (bol) {
1231  word->set_flag(W_BOL, TRUE);
1232  bol = FALSE;
1233  }
1234  if (box_it.at_first()) { // at end of line
1235  word->set_flag(W_EOL, TRUE);
1236  }
1237  }
1238  }
1239  while (!box_it.at_first()); // until back at start
1240  /* Setup the row with created words. */
1241  real_row = new ROW(row, (int16_t) row->kern_size, (int16_t) row->space_size);
1242  word_it.set_to_list(real_row->word_list());
1243  //put words in row
1244  word_it.add_list_after(&words);
1245  real_row->recalc_bounding_box();
1246  if (tosp_debug_level > 4) {
1247  tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1248  word_count,
1249  real_row->bounding_box().left(),
1250  real_row->bounding_box().bottom(),
1251  real_row->bounding_box().right(),
1252  real_row->bounding_box().top());
1253  }
1254  return real_row;
1255  }
      // Empty blob list: nothing to build.
1256  return nullptr;
1257 }
1258 
/**********************************************************************
 * make_a_word_break
 *
 * Decide whether the gap in front of blob_box is a word break.
 * Returns true when a break should be inserted. blanks, fuzzy_sp and
 * fuzzy_non are outputs describing that break.
 * break_at_next_gap is an in/out latch: when set on entry it is cleared
 * and true is returned at once; the punctuation rule below sets it.
 * prev_gap_was_a_space is updated only on the non-"old method" path
 * (apart from being forced true at the beginning of a row).
 *
 * NOTE(review): listing lines 1288, 1293, 1363, 1392, 1394, 1407, 1428,
 * 1447, 1466 and 1546 are absent from this listing, so several `else`
 * branches below have no visible `if` and some parentheses do not
 * balance. The code is left exactly as found; restore the missing lines
 * from the original file before compiling.
 **********************************************************************/
1259 bool Textord::make_a_word_break(
1260  TO_ROW* row, // row being made
1261  TBOX blob_box, // for next_blob // how many blanks?
1262  int16_t prev_gap,
1263  TBOX prev_blob_box,
1264  int16_t real_current_gap,
1265  int16_t within_xht_current_gap,
1266  TBOX next_blob_box,
1267  int16_t next_gap,
1268  uint8_t& blanks,
1269  bool& fuzzy_sp,
1270  bool& fuzzy_non,
1271  bool& prev_gap_was_a_space,
1272  bool& break_at_next_gap) {
1273  bool space;
1274  int16_t current_gap;
1275  float fuzzy_sp_to_kn_limit;
1276
      // A break requested by the previous call is honoured unconditionally.
1277  if (break_at_next_gap) {
1278  break_at_next_gap = false;
1279  return true;
1280  }
1281  /* Inhibit using the reduced gap if
1282  The kerning is large - chars are not kerned and reducing "f"s can cause
1283  erroneous blanks
1284  OR The real gap is less than 0
1285  OR The real gap is less than the kerning estimate
1286  */
1287  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1289  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1290  //Ignore the difference
1291  within_xht_current_gap = real_current_gap;
1292
1294  current_gap = within_xht_current_gap;
1295  else
1296  current_gap = real_current_gap;
1297
1298  if (tosp_old_to_method) {
1299  //Boring old method
      // Anything wider than max_nonspace is a break. Gaps between max_nonspace
      // and min_space are flagged fuzzy, on whichever side of space_threshold
      // they fall. blanks and the fuzzy flags are left untouched when the gap
      // is not a space or equals INT16_MAX.
1300  space = current_gap > row->max_nonspace;
1301  if (space && (current_gap < INT16_MAX)) {
1302  if (current_gap < row->min_space) {
1303  if (current_gap > row->space_threshold) {
1304  blanks = 1;
1305  fuzzy_sp = true;
1306  fuzzy_non = false;
1307  }
1308  else {
1309  blanks = 0;
1310  fuzzy_sp = false;
1311  fuzzy_non = true;
1312  }
1313  }
1314  else {
1315  blanks = (uint8_t) (current_gap / row->space_size);
1316  if (blanks < 1)
1317  blanks = 1;
1318  fuzzy_sp = false;
1319  fuzzy_non = false;
1320  }
1321  }
1322  return space;
1323  }
1324  else {
1325  /* New exciting heuristic method */
1326  if (prev_blob_box.null_box ()) // Beginning of row
1327  prev_gap_was_a_space = true;
1328
1329  //Default as old TO
1330  space = current_gap > row->space_threshold;
1331
1332  /* Set defaults for the word break in case we find one. Currently there are
1333  no fuzzy spaces. Depending on the reliability of the different heuristics
1334  we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1335  be used if the function returns TRUE - ie the word is to be broken.
1336  */
      // Default blank count: the gap divided by space_size, rounded and clipped
      // to [1, UINT8_MAX]. When space_size is not above 1 the raw gap is
      // clipped instead.
1337  int num_blanks = current_gap;
1338  if (row->space_size > 1.0f)
1339  num_blanks = IntCastRounded(current_gap / row->space_size);
1340  blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1341  fuzzy_sp = false;
1342  fuzzy_non = false;
1343  /*
1344  If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1345  despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1346  context.
1347  */
1348  if (tosp_use_xht_gaps &&
1349  (real_current_gap <= row->max_nonspace) &&
1350  (within_xht_current_gap > row->max_nonspace)) {
1351  space = true;
1352  fuzzy_non = true;
1353 #ifndef GRAPHICS_DISABLED
1354  mark_gap (blob_box, 20,
1355  prev_gap, prev_blob_box.width (),
1356  current_gap, next_blob_box.width (), next_gap);
1357 #endif
1358  }
1359  else if (tosp_use_xht_gaps &&
1360  (real_current_gap <= row->space_threshold) &&
1361  (within_xht_current_gap > row->space_threshold)) {
1362  space = true;
1364  fuzzy_sp = true;
1365  else
1366  fuzzy_non = true;
1367 #ifndef GRAPHICS_DISABLED
1368  mark_gap (blob_box, 21,
1369  prev_gap, prev_blob_box.width (),
1370  current_gap, next_blob_box.width (), next_gap);
1371 #endif
1372  }
1373  else if (tosp_use_xht_gaps &&
1374  (real_current_gap < row->min_space) &&
1375  (within_xht_current_gap >= row->min_space)) {
1376  space = true;
1377 #ifndef GRAPHICS_DISABLED
1378  mark_gap (blob_box, 22,
1379  prev_gap, prev_blob_box.width (),
1380  current_gap, next_blob_box.width (), next_gap);
1381 #endif
1382  }
      // Punctuation rule: the blob after this gap looks like punctuation but
      // the one before it does not, so request a break at the NEXT gap. `space`
      // keeps its default for the current gap.
1383  else if (tosp_force_wordbreak_on_punct &&
1384  !suspected_punct_blob(row, prev_blob_box) &&
1385  suspected_punct_blob(row, blob_box)) {
1386  break_at_next_gap = true;
1387  }
1388  /* Now continue with normal heuristics */
1389  else if ((current_gap < row->min_space) &&
1390  (current_gap > row->space_threshold)) {
1391  /* Heuristics to turn dubious spaces to kerns */
1393  fuzzy_sp_to_kn_limit = row->kern_size +
1395  (row->space_size - row->kern_size);
1396  else
1397  fuzzy_sp_to_kn_limit = 99999.0f;
1398
1399  /* If current gap is significantly smaller than the previous space the other
1400  side of a narrow blob then this gap is a kern. */
1401  if ((prev_blob_box.width () > 0) &&
1402  narrow_blob (row, prev_blob_box) &&
1403  prev_gap_was_a_space &&
1404  (current_gap <= tosp_gap_factor * prev_gap)) {
1405  if ((tosp_all_flips_fuzzy) ||
1406  (current_gap > fuzzy_sp_to_kn_limit)) {
1408  fuzzy_non = true;
1409  else
1410  fuzzy_sp = true;
1411  }
1412  else
1413  space = false;
1414 #ifndef GRAPHICS_DISABLED
1415  mark_gap (blob_box, 1,
1416  prev_gap, prev_blob_box.width (),
1417  current_gap, next_blob_box.width (), next_gap);
1418 #endif
1419  }
1420  /* If current gap not much bigger than the previous kern the other side of a
1421  narrow blob then this gap is a kern as well */
1422  else if ((prev_blob_box.width () > 0) &&
1423  narrow_blob (row, prev_blob_box) &&
1424  !prev_gap_was_a_space &&
1425  (current_gap * tosp_gap_factor <= prev_gap)) {
1426  if ((tosp_all_flips_fuzzy) ||
1427  (current_gap > fuzzy_sp_to_kn_limit)) {
1429  fuzzy_non = true;
1430  else
1431  fuzzy_sp = true;
1432  }
1433  else
1434  space = false;
1435 #ifndef GRAPHICS_DISABLED
1436  mark_gap (blob_box, 2,
1437  prev_gap, prev_blob_box.width (),
1438  current_gap, next_blob_box.width (), next_gap);
1439 #endif
1440  }
      // Rules 3 and 4 mirror rules 1 and 2, using the blob and gap that FOLLOW
      // the current gap instead of the ones before it.
1441  else if ((next_blob_box.width () > 0) &&
1442  narrow_blob (row, next_blob_box) &&
1443  (next_gap > row->space_threshold) &&
1444  (current_gap <= tosp_gap_factor * next_gap)) {
1445  if ((tosp_all_flips_fuzzy) ||
1446  (current_gap > fuzzy_sp_to_kn_limit)) {
1448  fuzzy_non = true;
1449  else
1450  fuzzy_sp = true;
1451  }
1452  else
1453  space = false;
1454 #ifndef GRAPHICS_DISABLED
1455  mark_gap (blob_box, 3,
1456  prev_gap, prev_blob_box.width (),
1457  current_gap, next_blob_box.width (), next_gap);
1458 #endif
1459  }
1460  else if ((next_blob_box.width () > 0) &&
1461  narrow_blob (row, next_blob_box) &&
1462  (next_gap <= row->space_threshold) &&
1463  (current_gap * tosp_gap_factor <= next_gap)) {
1464  if ((tosp_all_flips_fuzzy) ||
1465  (current_gap > fuzzy_sp_to_kn_limit)) {
1467  fuzzy_non = true;
1468  else
1469  fuzzy_sp = true;
1470  }
1471  else
1472  space = false;
1473 #ifndef GRAPHICS_DISABLED
1474  mark_gap (blob_box, 4,
1475  prev_gap, prev_blob_box.width (),
1476  current_gap, next_blob_box.width (), next_gap);
1477 #endif
1478  }
      // Rule 6: a narrow neighbour on either side and none of rules 1-4 fired;
      // keep the default verdict but mark it as a fuzzy space.
1479  else if ((((next_blob_box.width () > 0) &&
1480  narrow_blob (row, next_blob_box)) ||
1481  ((prev_blob_box.width () > 0) &&
1482  narrow_blob (row, prev_blob_box)))) {
1483  fuzzy_sp = true;
1484 #ifndef GRAPHICS_DISABLED
1485  mark_gap (blob_box, 6,
1486  prev_gap, prev_blob_box.width (),
1487  current_gap, next_blob_box.width (), next_gap);
1488 #endif
1489  }
1490  }
1491  else if ((current_gap > row->max_nonspace) &&
1492  (current_gap <= row->space_threshold)) {
1493
1494  /* Heuristics to turn dubious kerns to spaces */
1495  /* TRIED THIS BUT IT MADE THINGS WORSE
1496  if (prev_gap == INT16_MAX)
1497  prev_gap = 0; // start of row
1498  if (next_gap == INT16_MAX)
1499  next_gap = 0; // end of row
1500  */
1501  if ((prev_blob_box.width () > 0) &&
1502  (next_blob_box.width () > 0) &&
1503  (current_gap >=
1504  tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1505  wide_blob (row, prev_blob_box) &&
1506  wide_blob (row, next_blob_box)) {
1507
1508  space = true;
1509  /*
1510  tosp_flip_caution is an attempt to stop the default changing in cases
1511  where there is a large difference between the kern and space estimates.
1512  See problem in 'chiefs' where "have" gets split in the quotation.
1513  */
1514  if ((tosp_flip_fuzz_kn_to_sp) &&
1515  ((tosp_flip_caution <= 0) ||
1516  (tosp_flip_caution * row->kern_size > row->space_size)))
1517  fuzzy_sp = true;
1518  else
1519  fuzzy_non = true;
1520 #ifndef GRAPHICS_DISABLED
1521  mark_gap (blob_box, 7,
1522  prev_gap, prev_blob_box.width (),
1523  current_gap, next_blob_box.width (), next_gap);
1524 #endif
1525  } else if (prev_blob_box.width() > 0 &&
1526  next_blob_box.width() > 0 &&
1527  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1528  current_gap >=
1529  tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1530  !(narrow_blob(row, prev_blob_box) ||
1531  suspected_punct_blob(row, prev_blob_box)) &&
1532  !(narrow_blob(row, next_blob_box) ||
1533  suspected_punct_blob(row, next_blob_box))) {
1534  space = true;
1535  fuzzy_non = true;
1536 #ifndef GRAPHICS_DISABLED
1537  mark_gap (blob_box, 8,
1538  prev_gap, prev_blob_box.width (),
1539  current_gap, next_blob_box.width (), next_gap);
1540 #endif
1541  }
1542  else if ((tosp_kern_gap_factor3 > 0) &&
1543  (prev_blob_box.width () > 0) &&
1544  (next_blob_box.width () > 0) &&
1545  (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1547  (!suspected_punct_blob (row, prev_blob_box) &&
1548  !suspected_punct_blob (row, next_blob_box)))) {
1549  space = true;
1550  fuzzy_non = true;
1551 #ifndef GRAPHICS_DISABLED
1552  mark_gap (blob_box, 9,
1553  prev_gap, prev_blob_box.width (),
1554  current_gap, next_blob_box.width (), next_gap);
1555 #endif
1556  }
1557  }
1558  if (tosp_debug_level > 10)
1559  tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1560  "next_gap = %d\n", space ? 1 : 0, current_gap,
1561  prev_gap, next_gap);
      // Remember the verdict for the next call: only a break that is not
      // flagged fuzzy_non counts as "the previous gap was a space".
1562  prev_gap_was_a_space = space && !(fuzzy_non);
1563  return space;
1564  }
1565 }
1566 
// Returns true when blob_box counts as narrow: its width is at most
// tosp_narrow_fraction * row->xheight, or its width/height ratio (computed in
// float) is at most a limit whose operand is not visible here.
// NOTE(review): listing line 1571 -- the right-hand operand of the ratio test
// and the closing parentheses -- is absent from this listing; restore it from
// the original file.
1567 bool Textord::narrow_blob(TO_ROW* row, TBOX blob_box) {
1568  bool result;
1569  result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1570  (((float) blob_box.width () / blob_box.height ()) <=
1572  return result;
1573 }
1574 
// Returns true when blob_box counts as wide.
//  - tosp_wide_fraction > 0: the width must be at least
//    tosp_wide_fraction * row->xheight; when tosp_wide_aspect_ratio > 0 the
//    width/height ratio must additionally exceed a limit.
//  - otherwise: wide simply means "not narrow_blob".
// NOTE(review): listing line 1581 -- the right-hand operand of the ratio test
// and the closing parentheses -- is absent from this listing; restore it from
// the original file.
1575 bool Textord::wide_blob(TO_ROW* row, TBOX blob_box) {
1576  bool result;
1577  if (tosp_wide_fraction > 0) {
1578  if (tosp_wide_aspect_ratio > 0)
1579  result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1580  (((float) blob_box.width () / blob_box.height ()) >
1582  else
1583  result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1584  }
1585  else
1586  result = !narrow_blob (row, blob_box);
1587  return result;
1588 }
1589 
1590 bool Textord::suspected_punct_blob(TO_ROW* row, TBOX box) {
1591  bool result;
1592  float baseline;
1593  float blob_x_centre;
1594  /* Find baseline of centre of blob */
1595  blob_x_centre = (box.right () + box.left ()) / 2.0;
1596  baseline = row->baseline.y (blob_x_centre);
1597 
1598  result = (box.height () <= 0.66 * row->xheight) ||
1599  (box.top () < baseline + row->xheight / 2.0) ||
1600  (box.bottom () > baseline + row->xheight / 2.0);
1601  return result;
1602 }
1603 
1604 
1605 void Textord::peek_at_next_gap(TO_ROW *row,
1606  BLOBNBOX_IT box_it,
1607  TBOX &next_blob_box,
1608  int16_t &next_gap,
1609  int16_t &next_within_xht_gap) {
1610  TBOX next_reduced_blob_box;
1611  TBOX bit_beyond;
1612  BLOBNBOX_IT reduced_box_it = box_it;
1613 
1614  next_blob_box = box_next (&box_it);
1615  next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1616  if (box_it.at_first ()) {
1617  next_gap = INT16_MAX;
1618  next_within_xht_gap = INT16_MAX;
1619  }
1620  else {
1621  bit_beyond = box_it.data ()->bounding_box ();
1622  next_gap = bit_beyond.left () - next_blob_box.right ();
1623  bit_beyond = reduced_box_next (row, &reduced_box_it);
1624  next_within_xht_gap =
1625  bit_beyond.left () - next_reduced_blob_box.right ();
1626  }
1627 }
1628 
1629 
1630 #ifndef GRAPHICS_DISABLED
// Debug aid: draws an ellipse over a gap whose space/kern verdict was changed
// by heuristic `rule`, using a rule-specific pen colour, and optionally prints
// the gap measurements (tosp_debug_level > 5).
// Several rules share a colour: 4, 9 and the default are BLACK; 2 and 20 are
// CYAN; 3 and 21 are GREEN; 5 and 22 are MAGENTA.
// NOTE(review): listing line 1683 is absent from this listing; the `}` after
// the Ellipse call closes a block whose opening statement is therefore not
// visible. Restore it from the original file.
1631 void Textord::mark_gap(
1632  TBOX blob, // blob following gap
1633  int16_t rule, // heuristic id
1634  int16_t prev_gap,
1635  int16_t prev_blob_width,
1636  int16_t current_gap,
1637  int16_t next_blob_width,
1638  int16_t next_gap) {
1639  ScrollView::Color col; //of ellipse marking flipped gap
1640
1641  switch (rule) {
1642  case 1:
1643  col = ScrollView::RED;
1644  break;
1645  case 2:
1646  col = ScrollView::CYAN;
1647  break;
1648  case 3:
1649  col = ScrollView::GREEN;
1650  break;
1651  case 4:
1652  col = ScrollView::BLACK;
1653  break;
1654  case 5:
1655  col = ScrollView::MAGENTA;
1656  break;
1657  case 6:
1658  col = ScrollView::BLUE;
1659  break;
1660
1661  case 7:
1662  col = ScrollView::WHITE;
1663  break;
1664  case 8:
1665  col = ScrollView::YELLOW;
1666  break;
1667  case 9:
1668  col = ScrollView::BLACK;
1669  break;
1670
1671  case 20:
1672  col = ScrollView::CYAN;
1673  break;
1674  case 21:
1675  col = ScrollView::GREEN;
1676  break;
1677  case 22:
1678  col = ScrollView::MAGENTA;
1679  break;
1680  default:
1681  col = ScrollView::BLACK;
1682  }
1684  to_win->Pen(col);
1685  /* if (rule < 20)
1686  //interior_style(to_win, INT_SOLID, FALSE);
1687  else
1688  //interior_style(to_win, INT_HOLLOW, TRUE);*/
      // The ellipse spans the gap horizontally (radius current_gap / 2, centred
      // half a gap to the left of the blob) and the blob's height vertically.
1689  //x radius
1690  to_win->Ellipse (current_gap / 2.0f,
1691  blob.height () / 2.0f, //y radius
1692  //x centre
1693  blob.left () - current_gap / 2.0f,
1694  //y centre
1695  blob.bottom () + blob.height () / 2.0f);
1696  }
1697  if (tosp_debug_level > 5)
1698  tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n",
1699  blob.left() - current_gap / 2, blob.bottom(), rule, prev_gap,
1700  prev_blob_width, current_gap, next_blob_width, next_gap);
1701 }
1702 #endif
1703 
1704 float Textord::find_mean_blob_spacing(WERD *word) {
1705  C_BLOB_IT cblob_it;
1706  TBOX blob_box;
1707  int32_t gap_sum = 0;
1708  int16_t gap_count = 0;
1709  int16_t prev_right;
1710 
1711  cblob_it.set_to_list (word->cblob_list ());
1712  if (!cblob_it.empty ()) {
1713  cblob_it.mark_cycle_pt ();
1714  prev_right = cblob_it.data ()->bounding_box ().right ();
1715  //first blob
1716  cblob_it.forward ();
1717  for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1718  blob_box = cblob_it.data ()->bounding_box ();
1719  gap_sum += blob_box.left () - prev_right;
1720  gap_count++;
1721  prev_right = blob_box.right ();
1722  }
1723  }
1724  if (gap_count > 0)
1725  return (gap_sum / (float) gap_count);
1726  else
1727  return 0.0f;
1728 }
1729 
1730 
1731 bool Textord::ignore_big_gap(TO_ROW* row,
1732  int32_t row_length,
1733  GAPMAP* gapmap,
1734  int16_t left,
1735  int16_t right) {
1736  int16_t gap = right - left + 1;
1737 
1738  if (tosp_ignore_big_gaps > 999) return FALSE; // Don't ignore
1739  if (tosp_ignore_big_gaps > 0)
1740  return (gap > tosp_ignore_big_gaps * row->xheight);
1741  if (gap > tosp_ignore_very_big_gaps * row->xheight)
1742  return true;
1743  if (tosp_ignore_big_gaps == 0) {
1744  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1745  return true;
1746  if ((gap > 1.75 * row->xheight) &&
1747  ((row_length > 35 * row->xheight) ||
1748  gapmap->table_gap (left, right)))
1749  return true;
1750  }
1751  else {
1752  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1753  if ((gap > gapmap_big_gaps * row->xheight) &&
1754  gapmap->table_gap (left, right))
1755  return true;
1756  }
1757  return false;
1758 }
1759 
1760 /**********************************************************************
1761  * reduced_box_next
1762  *
1763  * Compute the bounding box of this blob with merging of x overlaps
1764  * but no pre-chopping.
1765  * Then move the iterator on to the start of the next blob.
1766  * DON'T reduce the box for small things - eg punctuation.
1767  **********************************************************************/
1768 TBOX Textord::reduced_box_next(
1769  TO_ROW *row, // current row
1770  BLOBNBOX_IT *it // iterator to blobs
1771  ) {
1772  BLOBNBOX *blob; //current blob
1773  BLOBNBOX *head_blob; //place to store box
1774  TBOX full_box; //full blob bounding box
1775  TBOX reduced_box; //box of significant part
1776  int16_t left_above_xht; //ABOVE xht left limit
1777  int16_t new_left_above_xht; //ABOVE xht left limit
1778
1779  blob = it->data ();
      // Fast path: the reduced box was already stored on this blob. Return it
      // and advance the iterator to the next blob that has a cblob and is not
      // joined to its predecessor.
1780  if (blob->red_box_set ()) {
1781  reduced_box = blob->reduced_box ();
1782  do {
1783  it->forward();
1784  blob = it->data();
1785  }
1786  while (blob->cblob() == nullptr || blob->joined_to_prev());
1787  return reduced_box;
1788  }
1789  head_blob = blob;
1790  full_box = blob->bounding_box ();
1791  reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
      // Absorb the following pieces that belong to the same blob: pieces with
      // no cblob widen full_box only, joined pieces widen reduced_box and may
      // lower left_above_xht.
1792  do {
1793  it->forward ();
1794  blob = it->data ();
1795  if (blob->cblob() == nullptr)
1796  //was pre-chopped
1797  full_box += blob->bounding_box ();
1798  else if (blob->joined_to_prev ()) {
1799  reduced_box +=
1800  reduced_box_for_blob(blob, row, &new_left_above_xht);
1801  left_above_xht = std::min(left_above_xht, new_left_above_xht);
1802  }
1803  }
1804  //until next real blob
1805  while (blob->cblob() == nullptr || blob->joined_to_prev());
1806
      // Keep the reduced box only when it is non-empty, taller than 0.7 of the
      // xheight, and the part above the xheight starts to the right of the
      // point tosp_near_lh_edge of the way across the reduced box. Otherwise
      // fall back to the full box.
1807  if ((reduced_box.width () > 0) &&
1808  ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1809  < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1810 #ifndef GRAPHICS_DISABLED
      // NOTE(review): listing lines 1811-1812 are absent from this listing, so
      // the contents of this GRAPHICS_DISABLED section are not visible here.
1813 #endif
1814  }
1815  else
1816  reduced_box = full_box;
      // Store the result on the first blob so later calls take the fast path.
1817  head_blob->set_reduced_box (reduced_box);
1818  return reduced_box;
1819 }
1820 
1821 
1822 /*************************************************************************
1823  * reduced_box_for_blob()
1824  * Find box for blob which is the same height and y position as the whole blob,
1825  * but whose left limit is the left most position of the blob ABOVE the
1826  * baseline and whose right limit is the right most position of the blob BELOW
1827  * the xheight.
1828  *
1829  *
1830  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1831  * "home". Perhaps we need something which says that if the width ABOVE the
1832  * xht alone includes the whole of the reduced width, then use the full
1833  * blob box - Might still fail on italic F
1834  *
1835  * Alternatively we could be a little less severe and only reduce the
1836  * left and right edges by half the difference between the full box and
1837  * the reduced box.
1838  *
1839  * NOTE that we need to rotate all the coordinates as
1840  * find_blob_limits finds the y min and max within a specified x band
1841  *************************************************************************/
1842 TBOX Textord::reduced_box_for_blob(
1843  BLOBNBOX *blob,
1844  TO_ROW *row,
1845  int16_t *left_above_xht) {
1846  float baseline;
1847  float blob_x_centre;
1848  float left_limit;
1849  float right_limit;
1850  float junk;
1851  TBOX blob_box;
1852 
1853  /* Find baseline of centre of blob */
1854 
1855  blob_box = blob->bounding_box ();
1856  blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1857  baseline = row->baseline.y (blob_x_centre);
1858 
1859  /*
1860  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1861  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1862  */
1863  left_limit = (float) INT32_MAX;
1864  junk = (float) -INT32_MAX;
1865  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1866  static_cast<float>(INT16_MAX), left_limit, junk);
1867  if (left_limit > junk)
1868  *left_above_xht = INT16_MAX; //No area above xht
1869  else
1870  *left_above_xht = (int16_t) floor (left_limit);
1871  /*
1872  Find reduced LH limit of blob - the left extent of the region ABOVE the
1873  baseline.
1874  */
1875  left_limit = (float) INT32_MAX;
1876  junk = (float) -INT32_MAX;
1877  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX),
1878  left_limit, junk);
1879 
1880  if (left_limit > junk)
1881  return TBOX (); //no area within xht so return empty box
1882  /*
1883  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1884  */
1885  junk = (float) INT32_MAX;
1886  right_limit = (float) -INT32_MAX;
1887  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX),
1888  (baseline + row->xheight), junk, right_limit);
1889  if (junk > right_limit)
1890  return TBOX (); //no area within xht so return empty box
1891 
1892  return TBOX (ICOORD ((int16_t) floor (left_limit), blob_box.bottom ()),
1893  ICOORD ((int16_t) ceil (right_limit), blob_box.top ()));
1894 }
1895 } // namespace tesseract
QSPLINE baseline
Definition: blobbox.h:683
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:666
int tosp_enough_space_samples_for_median
Definition: textord.h:304
double tosp_threshold_bias2
Definition: textord.h:318
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:161
bool tosp_row_use_cert_spaces1
Definition: textord.h:283
Definition: gap_map.h:16
bool tosp_block_use_cert_spaces
Definition: textord.h:277
int32_t pile_count(int32_t value) const
Definition: statistc.h:78
double tosp_gap_factor
Definition: textord.h:332
double tosp_wide_fraction
Definition: textord.h:323
bool tosp_only_small_gaps_for_kern
Definition: textord.h:286
double tosp_fuzzy_sp_fraction
Definition: textord.h:351
#define TRUE
Definition: capi.h:51
double tosp_ignore_big_gaps
Definition: textord.h:339
bool tosp_stats_use_xht_gaps
Definition: textord.h:291
const TBOX & reduced_box() const
Definition: blobbox.h:247
double tosp_pass_wide_fuzz_sp_to_context
Definition: textord.h:371
double tosp_old_sp_kn_th_factor
Definition: textord.h:314
bool null_box() const
Definition: rect.h:50
double tosp_dont_fool_with_small_kerns
Definition: textord.h:365
double tosp_ignore_very_big_gaps
Definition: textord.h:340
double tosp_rep_space
Definition: textord.h:341
double tosp_enough_small_gaps
Definition: textord.h:343
float fixed_pitch
Definition: blobbox.h:664
TBOX bounding_box() const
Definition: werd.cpp:159
WERD_LIST rep_words
Definition: blobbox.h:681
double gapmap_big_gaps
Definition: gap_map.cpp:18
Definition: rect.h:34
WERD_LIST * word_list()
Definition: ocrrow.h:55
double tosp_large_kerning
Definition: textord.h:363
double tosp_threshold_bias1
Definition: textord.h:316
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:249
Definition: werd.h:35
bool tosp_flip_fuzz_kn_to_sp
Definition: textord.h:298
float space_size
Definition: blobbox.h:680
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
bool tosp_narrow_blobs_not_cert
Definition: textord.h:281
Definition: statistc.h:33
double y(double x) const
Definition: quspline.cpp:209
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:637
double tosp_near_lh_edge
Definition: textord.h:367
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
double tosp_kern_gap_factor1
Definition: textord.h:334
double tosp_min_sane_kn_sp
Definition: textord.h:353
double tosp_table_kn_sp_ratio
Definition: textord.h:345
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:611
#define MAXSPACING
Definition: tospace.cpp:41
int16_t width() const
Definition: rect.h:115
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:250
float xheight
Definition: blobbox.h:670
int16_t left() const
Definition: rect.h:72
void plot(ScrollView *fd) const
Definition: rect.h:286
bool tosp_rule_9_test_punct
Definition: textord.h:297
int16_t top() const
Definition: rect.h:58
bool tosp_old_to_bug_fix
Definition: textord.h:275
double median() const
Definition: statistc.cpp:238
bool red_box_set() const
Definition: blobbox.h:260
double tosp_fuzzy_kn_fraction
Definition: textord.h:350
integer coordinate
Definition: points.h:32
#define FALSE
Definition: capi.h:52
double mean() const
Definition: statistc.cpp:134
bool tosp_only_use_prop_rows
Definition: textord.h:268
double tosp_table_fuzzy_kn_sp_ratio
Definition: textord.h:349
float kern_size
Definition: blobbox.h:679
double tosp_kern_gap_factor3
Definition: textord.h:338
bool joined_to_prev() const
Definition: blobbox.h:257
int IntCastRounded(double x)
Definition: helpers.h:168
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:577
void set_blanks(uint8_t new_blanks)
Definition: werd.h:105
double tosp_fuzzy_space_factor
Definition: textord.h:327
bool tosp_only_use_xht_gaps
Definition: textord.h:295
Definition: werd.h:59
TBOX bounding_box() const
Definition: ocrrow.h:88
Definition: ocrrow.h:36
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
double tosp_table_xht_sp_ratio
Definition: textord.h:347
int32_t space_threshold
Definition: blobbox.h:678
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1185
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
double tosp_fuzzy_space_factor2
Definition: textord.h:331
double tosp_init_guess_kn_mult
Definition: textord.h:355
bool tosp_flip_fuzz_sp_to_kn
Definition: textord.h:299
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
bool tosp_row_use_cert_spaces
Definition: textord.h:279
bool tosp_fuzzy_limit_all
Definition: textord.h:289
void recalc_bounding_box()
Definition: ocrrow.cpp:101
int tosp_redo_kern_limit
Definition: textord.h:306
bool tosp_recovery_isolated_row_stats
Definition: textord.h:285
double tosp_narrow_aspect_ratio
Definition: textord.h:322
int tosp_sanity_method
Definition: textord.h:311
double tosp_max_sane_kn_thresh
Definition: textord.h:359
bool tosp_use_pre_chopping
Definition: textord.h:273
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:892
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
Definition: points.h:189
const TBOX & bounding_box() const
Definition: blobbox.h:231
bool tosp_old_to_method
Definition: textord.h:263
int32_t min_space
Definition: blobbox.h:676
int16_t right() const
Definition: rect.h:79
double tosp_kern_gap_factor2
Definition: textord.h:336
double tosp_narrow_fraction
Definition: textord.h:320
int32_t max_nonspace
Definition: blobbox.h:677
double tosp_silly_kn_sp_gap
Definition: textord.h:369
double tosp_fuzzy_space_factor1
Definition: textord.h:329
bool tosp_force_wordbreak_on_punct
Definition: textord.h:271
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
double tosp_wide_aspect_ratio
Definition: textord.h:325
bool tosp_improve_thresh
Definition: textord.h:301
double tosp_init_guess_xht_mult
Definition: textord.h:357
int16_t height() const
Definition: rect.h:108
C_BLOB * cblob() const
Definition: blobbox.h:269
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612
int32_t get_total() const
Definition: statistc.h:86
bool tosp_use_xht_gaps
Definition: textord.h:293
double tosp_flip_caution
Definition: textord.h:361
bool tosp_all_flips_fuzzy
Definition: textord.h:287
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:44
PITCH_TYPE pitch_decision
Definition: blobbox.h:663
bool tosp_old_to_constrain_sp_kn
Definition: textord.h:266