tesseract  5.0.0-alpha-619-ge9db
tospace.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 /**********************************************************************
11  * tospace.cpp
12  *
13  * Compute fuzzy word spacing thresholds for each row.
14  * I.e. set : max_nonspace
15  * space_threshold
16  * min_space
17  * kern_size
18  * space_size
19  * for each row.
20  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21  *
22  * Note: functions in this file were originally not members of any
23  * class or enclosed by any namespace. Now they are all static members
24  * of the Textord class.
25  *
26  **********************************************************************/
27 
28 #include "drawtord.h"
29 #include "statistc.h"
30 #include "textord.h"
31 #include "tovars.h"
32 
33 // Include automatically generated configuration file if running autoconf.
34 #ifdef HAVE_CONFIG_H
35 #include "config_auto.h"
36 #endif
37 
38 #include <algorithm>
39 #include <memory>
40 
41 #define MAXSPACING 128 /*max expected spacing in pix */
42 
43 namespace tesseract {
45  ICOORD page_tr, //topright of page
46  TO_BLOCK_LIST *blocks //blocks on page
47  ) {
48  TO_BLOCK_IT block_it; //iterator
49  TO_BLOCK *block; //current block;
50  TO_ROW *row; //current row
51  int block_index; //block number
52  int row_index; //row number
53  //estimated width of real spaces for whole block
54  int16_t block_space_gap_width;
55  //estimated width of non space gaps for whole block
56  int16_t block_non_space_gap_width;
57  bool old_text_ord_proportional;//old fixed/prop result
58 
59  block_it.set_to_list (blocks);
60  block_index = 1;
61  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
62  block_it.forward ()) {
63  block = block_it.data ();
64  std::unique_ptr<GAPMAP> gapmap(new GAPMAP (block)); //map of big vert gaps in blk
65  block_spacing_stats(block,
66  gapmap.get(),
67  old_text_ord_proportional,
68  block_space_gap_width,
69  block_non_space_gap_width);
70  // Make sure relative values of block-level space and non-space gap
71  // widths are reasonable. The ratio of 1:3 is also used in
72  // block_spacing_stats, to correct the block_space_gap_width
73  // Useful for arabic and hindi, when the non-space gap width is
74  // often over-estimated and should not be trusted. A similar ratio
75  // is found in block_spacing_stats.
77  static_cast<float>(block_space_gap_width) / block_non_space_gap_width < 3.0) {
78  block_non_space_gap_width = static_cast<int16_t>(floor (block_space_gap_width / 3.0));
79  }
80  // row iterator
81  TO_ROW_IT row_it(block->get_rows());
82  row_index = 1;
83  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
84  row = row_it.data ();
85  if ((row->pitch_decision == PITCH_DEF_PROP) ||
86  (row->pitch_decision == PITCH_CORR_PROP)) {
87  if ((tosp_debug_level > 0) && !old_text_ord_proportional)
88  tprintf ("Block %d Row %d: Now Proportional\n",
89  block_index, row_index);
90  row_spacing_stats(row,
91  gapmap.get(),
92  block_index,
93  row_index,
94  block_space_gap_width,
95  block_non_space_gap_width);
96  }
97  else {
98  if ((tosp_debug_level > 0) && old_text_ord_proportional)
99  tprintf
100  ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
101  block_index, row_index, row->pitch_decision,
102  row->fixed_pitch);
103  }
104 #ifndef GRAPHICS_DISABLED
106  plot_word_decisions (to_win, static_cast<int16_t>(row->fixed_pitch), row);
107 #endif
108  row_index++;
109  }
110  block_index++;
111  }
112 }
113 
114 
115 /*************************************************************************
116  * block_spacing_stats()
117  *************************************************************************/
118 
// block_spacing_stats: derive block-wide gap estimates from the blobs of the
// rows of `block`.
//   block                     - block whose rows (block->get_rows()) are scanned.
//   gapmap                    - handed to ignore_big_gap() for every gap.
//   old_text_ord_proportional - out, debug only: true when there are too few
//                               gaps, otherwise whether twice the IQR of the
//                               centre-to-centre distances exceeds the IQR of
//                               the gaps.
//   block_space_gap_width     - out: estimated width of a real space, or -1
//                               when there are too few samples.
//   block_non_space_gap_width - out: narrowest blob width when there are too
//                               few gaps, otherwise floor(median of all gaps).
// NOTE(review): this text is a numbered listing with lines missing; listing
// lines 149, 155, 166, 223, 228, 233, 242, 259, 261, 264 and 265 are absent.
// The conditions that stood on those lines cannot be seen here - restore them
// from the upstream file before relying on this copy.
119 void Textord::block_spacing_stats(
120  TO_BLOCK* block,
121  GAPMAP* gapmap,
122  bool& old_text_ord_proportional,
123  int16_t& block_space_gap_width, // resulting estimate
124  int16_t& block_non_space_gap_width // resulting estimate
125 ) {
126  TO_ROW *row; // current row
127  BLOBNBOX_IT blob_it; // iterator
128 
129  STATS centre_to_centre_stats (0, MAXSPACING);
130  // DEBUG USE ONLY
131  STATS all_gap_stats (0, MAXSPACING);
132  STATS space_gap_stats (0, MAXSPACING);
133  int16_t minwidth = MAXSPACING; // narrowest blob
134  TBOX blob_box;
135  TBOX prev_blob_box;
136  int16_t centre_to_centre;
137  int16_t gap_width;
138  float real_space_threshold;
139  float iqr_centre_to_centre; // DEBUG USE ONLY
140  float iqr_all_gap_stats; // DEBUG USE ONLY
141  int32_t end_of_row;
142  int32_t row_length;
143 
// First pass: histogram every gap (and centre-to-centre distance) that
// ignore_big_gap() does not reject, and track the narrowest blob seen.
144  // row iterator
145  TO_ROW_IT row_it(block->get_rows());
146  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
147  row = row_it.data ();
148  if (!row->blob_list ()->empty () &&
// NOTE(review): listing line 149 (start of the parenthesised row filter) is missing.
150  (row->pitch_decision == PITCH_DEF_PROP) ||
151  (row->pitch_decision == PITCH_CORR_PROP))) {
152  blob_it.set_to_list (row->blob_list ());
153  blob_it.mark_cycle_pt ();
// Right edge of the blob just before the iterator's start position.
154  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
// NOTE(review): listing line 155 (the `if` selecting box_next_pre_chopped) is missing.
156  blob_box = box_next_pre_chopped (&blob_it);
157  else if (tosp_stats_use_xht_gaps)
158  blob_box = reduced_box_next (row, &blob_it);
159  else
160  blob_box = box_next (&blob_it);
161  row_length = end_of_row - blob_box.left ();
162  if (blob_box.width () < minwidth)
163  minwidth = blob_box.width ();
164  prev_blob_box = blob_box;
165  while (!blob_it.cycled_list ()) {
167  blob_box = box_next_pre_chopped (&blob_it);
168  else if (tosp_stats_use_xht_gaps)
169  blob_box = reduced_box_next (row, &blob_it);
170  else
171  blob_box = box_next (&blob_it);
172  if (blob_box.width () < minwidth)
173  minwidth = blob_box.width ();
174  int16_t left = prev_blob_box.right();
175  int16_t right = blob_box.left();
176  gap_width = right - left;
177  if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
178  all_gap_stats.add (gap_width, 1);
179 
// Half the sum of (left-to-left) and (right-to-right) distances of the two boxes.
180  centre_to_centre = (right + blob_box.right () -
181  (prev_blob_box.left () + left)) / 2;
182  //DEBUG
183  centre_to_centre_stats.add (centre_to_centre, 1);
184  // DEBUG
185  }
186  prev_blob_box = blob_box;
187  }
188  }
189  }
190 
191  //Inadequate samples
192  if (all_gap_stats.get_total () <= 1) {
193  block_non_space_gap_width = minwidth;
194  block_space_gap_width = -1; //No est. space width
195  //DEBUG
196  old_text_ord_proportional = true;
197  }
198  else {
199  /* For debug only ..... */
200  iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
201  centre_to_centre_stats.ile (0.25);
202  iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
203  old_text_ord_proportional =
204  iqr_centre_to_centre * 2 > iqr_all_gap_stats;
205  /* .......For debug only */
206 
207  /*
208  The median of the gaps is used as an estimate of the NON-SPACE gap width.
209  This RELIES on the assumption that there are more gaps WITHIN words than
210  BETWEEN words in a block
211 
212  Now try to estimate the width of a real space for all real spaces in the
213  block. Do this by using a crude threshold to ignore "narrow" gaps, then
214  find the median of the "wide" gaps and use this.
215  */
216  block_non_space_gap_width = static_cast<int16_t>(floor (all_gap_stats.median ()));
217  // median gap
218 
// Second pass: collect only the gaps wider than real_space_threshold.
219  row_it.set_to_list (block->get_rows ());
220  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
221  row = row_it.data ();
222  if (!row->blob_list ()->empty () &&
224  (row->pitch_decision == PITCH_DEF_PROP) ||
225  (row->pitch_decision == PITCH_CORR_PROP))) {
226  real_space_threshold =
227  std::max (tosp_init_guess_kn_mult * block_non_space_gap_width,
// NOTE(review): listing line 228 (second argument of std::max) is missing.
229  blob_it.set_to_list (row->blob_list ());
230  blob_it.mark_cycle_pt ();
231  end_of_row =
232  blob_it.data_relative (-1)->bounding_box ().right ();
234  blob_box = box_next_pre_chopped (&blob_it);
235  else if (tosp_stats_use_xht_gaps)
236  blob_box = reduced_box_next (row, &blob_it);
237  else
238  blob_box = box_next (&blob_it);
// NOTE(review): the operands are reversed relative to the first pass above
// (end_of_row - blob_box.left ()), so this row_length has the opposite sign.
// Looks unintended - confirm against ignore_big_gap() before changing.
239  row_length = blob_box.left () - end_of_row;
240  prev_blob_box = blob_box;
241  while (!blob_it.cycled_list ()) {
243  blob_box = box_next_pre_chopped (&blob_it);
244  else if (tosp_stats_use_xht_gaps)
245  blob_box = reduced_box_next (row, &blob_it);
246  else
247  blob_box = box_next (&blob_it);
248  int16_t left = prev_blob_box.right();
249  int16_t right = blob_box.left();
250  gap_width = right - left;
251  if ((gap_width > real_space_threshold) &&
252  !ignore_big_gap(row, row_length, gapmap, left, right)) {
253  /*
254  If tosp_use_cert_spaces is enabled, the estimate of the space gap is
255  restricted to obvious spaces - those wider than half the xht or those
256  with wide blobs on both sides - i.e not things that are suspect 1's or
257  punctuation that is sometimes widely spaced.
258  */
// NOTE(review): listing lines 259, 261, 264 and 265 are missing from the
// condition below, so its exact shape cannot be read from this extract.
260  (gap_width >
262  ||
263  ((gap_width >
266  || (!narrow_blob (row, prev_blob_box)
267  && !narrow_blob (row, blob_box))))
268  || (wide_blob (row, prev_blob_box)
269  && wide_blob (row, blob_box)))
270  space_gap_stats.add (gap_width, 1);
271  }
272  prev_blob_box = blob_box;
273  }
274  }
275  }
276  //Inadequate samples
277  if (space_gap_stats.get_total () <= 2)
278  block_space_gap_width = -1;//No est. space width
279  else
// Never report a space narrower than three times the non-space estimate.
280  block_space_gap_width =
281  std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
282  static_cast<int16_t>(3 * block_non_space_gap_width));
283  }
284 }
285 
286 
287 /*************************************************************************
288  * row_spacing_stats()
289  * Set values for min_space, max_non_space based on row stats only
290  * If failure - return 0 values.
291  *************************************************************************/
// row_spacing_stats: set kern_size, space_size, space_threshold, min_space and
// max_nonspace on `row`.
//   row                       - row to measure and update.
//   gapmap                    - handed to ignore_big_gap() and isolated_row_stats().
//   block_idx, row_idx        - used in debug messages and passed on to
//                               isolated_row_stats().
//   block_space_gap_width     - block estimate of a space; when it is not > 0 it
//                               is replaced locally by floor(row->xheight / 2).
//   block_non_space_gap_width - block estimate of a non-space gap.
// Stages, in order: collect gap histograms for the row; decide whether the row
// looks like a table; pick kern/space/threshold via old_to_method(),
// isolated_row_stats() or the block defaults; optionally call
// improve_row_threshold(); apply the sanity rules selected by
// tosp_sanity_method; finally derive min_space and max_nonspace.
// NOTE(review): this text is a numbered listing with lines missing; listing
// lines 338, 347, 360, 363, 385, 396, 399, 411, 459, 476, 545 and 564 are
// absent, so several conditions and expressions below are incomplete here.
292 void Textord::row_spacing_stats(
293  TO_ROW *row,
294  GAPMAP *gapmap,
295  int16_t block_idx,
296  int16_t row_idx,
297  int16_t block_space_gap_width, //estimate for block
298  int16_t block_non_space_gap_width //estimate for block
299  ) {
300  //iterator
301  BLOBNBOX_IT blob_it = row->blob_list ();
302  STATS all_gap_stats (0, MAXSPACING);
303  STATS cert_space_gap_stats (0, MAXSPACING);
304  STATS all_space_gap_stats (0, MAXSPACING);
305  STATS small_gap_stats (0, MAXSPACING);
306  TBOX blob_box;
307  TBOX prev_blob_box;
308  int16_t gap_width;
309  int16_t real_space_threshold = 0;
310  int16_t max = 0;
311  int16_t index;
312  int16_t large_gap_count = 0;
313  bool suspected_table;
314  int32_t max_max_nonspace; //upper bound
315  bool good_block_space_estimate = block_space_gap_width > 0;
316  int32_t end_of_row;
317  int32_t row_length = 0;
318  float sane_space;
319  int32_t sane_threshold;
320 
321  /* Collect first pass stats for row */
322 
323  if (!good_block_space_estimate)
324  block_space_gap_width = int16_t (floor (row->xheight / 2));
325  if (!row->blob_list ()->empty ()) {
// Initial split point between kerns and spaces: either biased between the two
// block estimates by tosp_threshold_bias1, or their midpoint.
326  if (tosp_threshold_bias1 > 0)
327  real_space_threshold =
328  block_non_space_gap_width +
329  int16_t (floor (0.5 +
330  tosp_threshold_bias1 * (block_space_gap_width -
331  block_non_space_gap_width)));
332  else
333  real_space_threshold = //Old TO method
334  (block_space_gap_width + block_non_space_gap_width) / 2;
335  blob_it.set_to_list (row->blob_list ());
336  blob_it.mark_cycle_pt ();
337  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
// NOTE(review): listing line 338 (the `if` selecting box_next_pre_chopped) is missing.
339  blob_box = box_next_pre_chopped (&blob_it);
340  else if (tosp_stats_use_xht_gaps)
341  blob_box = reduced_box_next (row, &blob_it);
342  else
343  blob_box = box_next (&blob_it);
344  row_length = end_of_row - blob_box.left ();
345  prev_blob_box = blob_box;
346  while (!blob_it.cycled_list ()) {
348  blob_box = box_next_pre_chopped (&blob_it);
349  else if (tosp_stats_use_xht_gaps)
350  blob_box = reduced_box_next (row, &blob_it);
351  else
352  blob_box = box_next (&blob_it);
353  int16_t left = prev_blob_box.right();
354  int16_t right = blob_box.left();
355  gap_width = right - left;
// Gaps rejected by ignore_big_gap() are only counted; they enter no histogram.
356  if (ignore_big_gap(row, row_length, gapmap, left, right)) {
357  large_gap_count++;
358  } else {
359  if (gap_width >= real_space_threshold) {
// NOTE(review): listing lines 360 and 363 are missing from the condition below.
361  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
362  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
364  || (!narrow_blob (row, prev_blob_box)
365  && !narrow_blob (row, blob_box))))
366  || (wide_blob (row, prev_blob_box)
367  && wide_blob (row, blob_box)))
368  cert_space_gap_stats.add (gap_width, 1);
369  all_space_gap_stats.add (gap_width, 1);
370  }
371  else
372  small_gap_stats.add (gap_width, 1);
373  all_gap_stats.add (gap_width, 1);
374  }
375  prev_blob_box = blob_box;
376  }
377  }
// A row is treated as a table when more than one big gap was ignored, or when
// one was ignored and the row has at most tosp_few_samples ordinary gaps.
378  suspected_table = (large_gap_count > 1) ||
379  ((large_gap_count > 0) &&
380  (all_gap_stats.get_total () <= tosp_few_samples));
381 
382  /* Now determine row kern size, space size and threshold */
383 
// NOTE(review): listing line 385 (right-hand side of the >= comparison) is missing.
384  if ((cert_space_gap_stats.get_total () >=
386  ((suspected_table ||
387  all_gap_stats.get_total () <= tosp_short_row) &&
388  cert_space_gap_stats.get_total () > 0)) {
389  old_to_method(row,
390  &all_gap_stats,
391  &cert_space_gap_stats,
392  &small_gap_stats,
393  block_space_gap_width,
394  block_non_space_gap_width);
395  } else {
// NOTE(review): listing lines 396 and 399 are missing around this condition.
397  !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
398  block_idx, row_idx)) {
400  tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
401  block_idx, row_idx);
402  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
403  //Use block default
404  row->space_size = block_space_gap_width;
405  if (all_gap_stats.get_total () > tosp_redo_kern_limit)
406  row->kern_size = all_gap_stats.median ();
407  else
408  row->kern_size = block_non_space_gap_width;
409  row->space_threshold =
410  int32_t (floor ((row->space_size + row->kern_size) /
// NOTE(review): listing line 411 (the divisor and closing parentheses) is missing.
412  }
413  else
414  old_to_method(row,
415  &all_gap_stats,
416  &all_space_gap_stats,
417  &small_gap_stats,
418  block_space_gap_width,
419  block_non_space_gap_width);
420  }
421  }
422 
423  if (tosp_improve_thresh && !suspected_table)
424  improve_row_threshold(row, &all_gap_stats);
425 
426  /* Now lets try to be careful not to do anything silly with tables when we
427  are ignoring big gaps*/
428  if (tosp_sanity_method == 0) {
429  if (suspected_table &&
430  (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
431  if (tosp_debug_level > 5)
432  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx,
433  row_idx, row->kern_size, row->space_threshold, row->space_size);
434  row->space_threshold =
435  static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
436  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
437  }
438  }
439  else if (tosp_sanity_method == 1) {
440  sane_space = row->space_size;
441  /* NEVER let space size get too close to kern size */
442  if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f))
443  || ((row->space_size - row->kern_size) <
444  (tosp_silly_kn_sp_gap * row->xheight))) {
445  if (good_block_space_estimate &&
446  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
447  sane_space = block_space_gap_width;
448  else
449  sane_space =
450  std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
451  row->xheight / 2.0f);
452  if (tosp_debug_level > 5)
453  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
454  block_idx, row_idx, row->kern_size, row->space_threshold,
455  row->space_size, sane_space);
456  row->space_size = sane_space;
457  row->space_threshold =
458  int32_t (floor ((row->space_size + row->kern_size) /
// NOTE(review): listing line 459 (the divisor and closing parentheses) is missing.
460  }
461  /* NEVER let threshold get VERY far away from kern */
462  sane_threshold = int32_t (floor (tosp_max_sane_kn_thresh *
463  std::max(row->kern_size, 2.5f)));
464  if (row->space_threshold > sane_threshold) {
465  if (tosp_debug_level > 5)
466  tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
467  block_idx, row_idx, row->kern_size, row->space_threshold,
468  row->space_size, sane_threshold);
469  row->space_threshold = sane_threshold;
470  if (row->space_size <= sane_threshold)
471  row->space_size = row->space_threshold + 1.0f;
472  }
473  /* Beware of tables - there may be NO spaces */
474  if (suspected_table) {
475  sane_space = std::max(tosp_table_kn_sp_ratio * row->kern_size,
// NOTE(review): listing line 476 (second argument of std::max) is missing.
477  sane_threshold = int32_t (floor ((sane_space + row->kern_size) / 2));
478 
479  if ((row->space_size < sane_space) ||
480  (row->space_threshold < sane_threshold)) {
481  if (tosp_debug_level > 5)
482  tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
483  block_idx, row_idx,
484  row->kern_size,
485  row->space_threshold, row->space_size);
486  //the minimum sane value
487  row->space_threshold = static_cast<int32_t>(sane_space);
488  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
489  }
490  }
491  }
492 
493  /* Now lets try to put some error limits on the threshold */
494 
495  if (tosp_old_to_method) {
496  /* Old textord made a space if gap >= threshold */
497  //NO FUZZY SPACES YET
498  row->max_nonspace = row->space_threshold;
499  //NO FUZZY SPACES YET
500  row->min_space = row->space_threshold + 1;
501  }
502  else {
503  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
504  row->min_space =
505  std::min(int32_t (ceil (tosp_fuzzy_space_factor * row->xheight)),
506  int32_t (row->space_size));
507  if (row->min_space <= row->space_threshold)
508  // Don't be silly
509  row->min_space = row->space_threshold + 1;
510  /*
511  Lets try to guess the max certain kern gap by looking at the cluster of
512  kerns for the row. The row is proportional so the kerns should cluster
513  tightly at the bottom of the distribution. We also expect most gaps to be
514  kerns. Find the maximum of the kern piles between 0 and twice the kern
515  estimate. Piles before the first one with less than 1/10 the maximum
516  number of samples can be taken as certain kerns.
517 
518  Of course, there are some cases where the kern peak and space peaks merge,
519  so we will put an UPPER limit on the max certain kern gap of some fraction
520  below the threshold.
521  */
522 
523  max_max_nonspace = int32_t ((row->space_threshold + row->kern_size) / 2);
524 
525  //default
526  row->max_nonspace = max_max_nonspace;
// `max` tracks the tallest pile seen so far; stop at the first pile above
// kern_size that holds fewer than a tenth of that many samples.
527  for (index = 0; index <= max_max_nonspace; index++) {
528  if (all_gap_stats.pile_count (index) > max)
529  max = all_gap_stats.pile_count (index);
530  if ((index > row->kern_size) &&
531  (all_gap_stats.pile_count (index) < 0.1 * max)) {
532  row->max_nonspace = index;
533  break;
534  }
535  }
536  }
537 
538  /* Yet another algorithm - simpler this time - just choose a fraction of the
539  threshold to space range */
540 
541  if ((tosp_fuzzy_sp_fraction > 0) &&
542  (row->space_size > row->space_threshold))
543  row->min_space = std::max(row->min_space,
544  static_cast<int32_t>(ceil (row->space_threshold +
// NOTE(review): listing line 545 (the multiplier applied to the range below) is missing.
546  (row->space_size -
547  row->space_threshold))));
548 
549  /* Ensure that ANY space less than some multiplier times the kern size is
550  fuzzy. In tables there is a risk of erroneously setting a small space size
551  when there are no real spaces. Sometimes tables have text squashed into
552  columns so that the kn->sp ratio is small anyway - this means that we can't
553  use this to force a wider separation - hence we rely on context to join any
554  dubious breaks. */
555 
556  if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
557  (suspected_table || tosp_fuzzy_limit_all))
558  row->min_space = std::max(row->min_space,
559  static_cast<int32_t>(ceil (tosp_table_fuzzy_kn_sp_ratio *
560  row->kern_size)));
561 
562  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
563  row->max_nonspace = static_cast<int32_t>(floor (0.5 + row->kern_size +
// NOTE(review): listing line 564 (the multiplier applied to the range below) is missing.
565  (row->space_threshold -
566  row->kern_size)));
567  }
568  if (row->max_nonspace > row->space_threshold) {
569  // Don't be silly
570  row->max_nonspace = row->space_threshold;
571  }
572 
573  if (tosp_debug_level > 5)
574  tprintf
575  ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
576  block_idx, row_idx, row_length, block_non_space_gap_width,
577  block_space_gap_width, real_space_threshold, row->kern_size,
578  row->max_nonspace, row->space_threshold, row->min_space,
579  row->space_size);
580  if (tosp_debug_level > 10)
581  tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
582  "row->space_threshold = %d\n",
583  row->kern_size, row->space_size, row->space_threshold);
584 }
585 
// old_to_method: set row->space_size, row->kern_size and row->space_threshold
// from the supplied histograms, limited by the block-level estimates.
//   row                       - row whose three fields are written.
//   all_gap_stats             - fallback source for the kern median.
//   space_gap_stats           - gaps taken to be spaces; median or mean gives
//                               space_size depending on the sample count.
//   small_gap_stats           - gaps taken to be kerns.
//   block_space_gap_width     - block space estimate; used as the default and
//                               as a ceiling on the row value.
//   block_non_space_gap_width - block kern estimate; used as the default kern
//                               and as a floor on the row space size.
// NOTE(review): this text is a numbered listing with lines missing; listing
// lines 601, 615, 630, 642, 660, 666 and 668 are absent, so the guarding
// conditions on those lines cannot be read here.
586 void Textord::old_to_method(
587  TO_ROW *row,
588  STATS *all_gap_stats,
589  STATS *space_gap_stats,
590  STATS *small_gap_stats,
591  int16_t block_space_gap_width, //estimate for block
592  int16_t block_non_space_gap_width //estimate for block
593  ) {
594  /* First, estimate row space size */
595  /* Old to condition was > 2 */
596  if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
597  //Adequate samples
598  /* Set space size to median of spaces BUT limits it if it seems wildly out */
599  row->space_size = space_gap_stats->median ();
600  if (row->space_size > block_space_gap_width * 1.5) {
// NOTE(review): listing line 601 (the `if` choosing between the two caps) is missing.
602  row->space_size = block_space_gap_width * 1.5;
603  else
604  //BUG??? should be *1.5
605  row->space_size = block_space_gap_width;
606  }
607  if (row->space_size < (block_non_space_gap_width * 2) + 1)
608  row->space_size = (block_non_space_gap_width * 2) + 1;
609  }
610  //Only 1 or 2 samples
611  else if (space_gap_stats->get_total () >= 1) {
612  //hence mean not median
613  row->space_size = space_gap_stats->mean ();
614  if (row->space_size > block_space_gap_width * 1.5) {
616  row->space_size = block_space_gap_width * 1.5;
617  else
618  //BUG??? should be *1.5
619  row->space_size = block_space_gap_width;
620  }
// With only a sample or two the floor is stricter: 3x rather than 2x the kern.
621  if (row->space_size < (block_non_space_gap_width * 3) + 1)
622  row->space_size = (block_non_space_gap_width * 3) + 1;
623  }
624  else {
625  //Use block default
626  row->space_size = block_space_gap_width;
627  }
628 
629  /* Next, estimate row kern size */
// NOTE(review): listing line 630 (start of the first kern condition) is missing.
631  (small_gap_stats->get_total () > tosp_redo_kern_limit))
632  row->kern_size = small_gap_stats->median ();
633  else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
634  row->kern_size = all_gap_stats->median ();
635  else //old TO -SAME FOR ALL ROWS
636  row->kern_size = block_non_space_gap_width;
637 
638  /* Finally, estimate row space threshold */
639  if (tosp_threshold_bias2 > 0) {
640  row->space_threshold =
641  int32_t (floor (0.5 + row->kern_size +
// NOTE(review): listing line 642 (the biased fraction of the kern-to-space range) is missing.
643  row->kern_size)));
644  } else {
645  /*
646  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
647  and holds this in a float. The use is with a >= test
648  NEW textord uses an integer threshold and a > test
649  It comes to the same thing.
650  (Though there is a difference in that old textor has integer space_size
651  and kern_size.)
652  */
653  row->space_threshold =
654  int32_t (floor ((row->space_size + row->kern_size) / 2));
655  }
656 
657  // Apply the same logic and ratios as in row_spacing_stats to
658  // restrict relative values of the row's space_size, kern_size, and
659  // space_threshold
// NOTE(review): listing lines 660, 666 and 668 are missing from the statement
// below, so what is assigned when kern_size > 2.5 cannot be seen here.
661  ((row->space_size <
662  tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
663  ((row->space_size - row->kern_size) <
664  tosp_silly_kn_sp_gap * row->xheight))) {
665  if (row->kern_size > 2.5)
667  row->space_threshold = int32_t (floor ((row->space_size + row->kern_size) /
669  }
670 }
671 
672 
673 /*************************************************************************
674  * isolated_row_stats()
675  * Set space_size, kern_size and space_threshold from this row's own gap stats; returns false if they cannot be estimated.
676  *************************************************************************/
// isolated_row_stats: estimate row->space_size, row->kern_size and
// row->space_threshold from the gaps of this row alone.
//   row                - row to measure and update.
//   gapmap             - handed to ignore_big_gap() for every gap.
//   all_gap_stats      - histogram of the row's gaps built by the caller; its
//                        median is the starting kern estimate.
//   suspected_table    - when true, a mean of the certain spaces is accepted
//                        even if there are too few of them for a median.
//   block_idx, row_idx - used only in debug messages.
// Returns false, leaving the row untouched, when there are too few gaps or too
// small a share of small gaps; returns false after zeroing the three fields
// when the result fails the sanity check; returns true otherwise.
// NOTE(review): this text is a numbered listing with lines missing; listing
// lines 700, 716, 725, 738, 751, 759 and 765 are absent.
677 bool Textord::isolated_row_stats(TO_ROW* row,
678  GAPMAP* gapmap,
679  STATS* all_gap_stats,
680  bool suspected_table,
681  int16_t block_idx,
682  int16_t row_idx) {
683  float kern_estimate;
684  float crude_threshold_estimate;
685  int16_t small_gaps_count;
686  int16_t total;
687  //iterator
688  BLOBNBOX_IT blob_it = row->blob_list ();
689  STATS cert_space_gap_stats (0, MAXSPACING);
690  STATS all_space_gap_stats (0, MAXSPACING);
691  STATS small_gap_stats (0, MAXSPACING);
692  TBOX blob_box;
693  TBOX prev_blob_box;
694  int16_t gap_width;
695  int32_t end_of_row;
696  int32_t row_length;
697 
698  kern_estimate = all_gap_stats->median ();
699  crude_threshold_estimate = std::max(tosp_init_guess_kn_mult * kern_estimate,
// NOTE(review): listing line 700 (second argument of std::max) is missing.
701  small_gaps_count = stats_count_under (all_gap_stats,
702  static_cast<int16_t>(ceil (crude_threshold_estimate)));
703  total = all_gap_stats->get_total ();
704 
// Give up unless there are enough gaps, enough of them are small, and at
// least one gap is not small.
705  if ((total <= tosp_redo_kern_limit) ||
706  ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
707  (total - small_gaps_count < 1)) {
708  if (tosp_debug_level > 5)
709  tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx,
710  row_idx);
711  return false;
712  }
713  blob_it.set_to_list (row->blob_list ());
714  blob_it.mark_cycle_pt ();
715  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
// NOTE(review): listing line 716 (the `if` selecting box_next_pre_chopped) is missing.
717  blob_box = box_next_pre_chopped (&blob_it);
718  else if (tosp_stats_use_xht_gaps)
719  blob_box = reduced_box_next (row, &blob_it);
720  else
721  blob_box = box_next (&blob_it);
722  row_length = end_of_row - blob_box.left ();
723  prev_blob_box = blob_box;
724  while (!blob_it.cycled_list ()) {
726  blob_box = box_next_pre_chopped (&blob_it);
727  else if (tosp_stats_use_xht_gaps)
728  blob_box = reduced_box_next (row, &blob_it);
729  else
730  blob_box = box_next (&blob_it);
731  int16_t left = prev_blob_box.right();
732  int16_t right = blob_box.left();
733  gap_width = right - left;
734  if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
735  (gap_width > crude_threshold_estimate)) {
736  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
737  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
// NOTE(review): listing line 738 is missing from this condition.
739  (!narrow_blob (row, prev_blob_box) &&
740  !narrow_blob (row, blob_box)))) ||
741  (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
742  cert_space_gap_stats.add (gap_width, 1);
743  all_space_gap_stats.add (gap_width, 1);
744  }
// A gap exactly equal to the crude threshold lands in neither histogram.
745  if (gap_width < crude_threshold_estimate)
746  small_gap_stats.add (gap_width, 1);
747 
748  prev_blob_box = blob_box;
749  }
// NOTE(review): listing lines 751 and 759 (right-hand sides of the two >=
// comparisons below) are missing.
750  if (cert_space_gap_stats.get_total () >=
752  //median
753  row->space_size = cert_space_gap_stats.median ();
754  else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
755  //to avoid spaced
756  row->space_size = cert_space_gap_stats.mean ();
757  // 1's in tables
758  else if (all_space_gap_stats.get_total () >=
760  //median
761  row->space_size = all_space_gap_stats.median ();
762  else
763  row->space_size = all_space_gap_stats.mean ();
764 
// NOTE(review): listing line 765 (the `if` guarding the next assignment) is missing.
766  row->kern_size = small_gap_stats.median ();
767  else
768  row->kern_size = all_gap_stats->median ();
769  row->space_threshold =
770  int32_t (floor ((row->space_size + row->kern_size) / 2));
771  /* Sanity check */
// Require kern_size < space_threshold < space_size with a positive threshold.
772  if ((row->kern_size >= row->space_threshold) ||
773  (row->space_threshold >= row->space_size) ||
774  (row->space_threshold <= 0)) {
775  if (tosp_debug_level > 5)
776  tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
777  block_idx, row_idx,
778  row->kern_size, row->space_threshold, row->space_size);
779  row->kern_size = 0.0f;
780  row->space_threshold = 0;
781  row->space_size = 0.0f;
782  return false;
783  }
784 
785  if (tosp_debug_level > 5)
786  tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
787  block_idx, row_idx,
788  row->kern_size, row->space_threshold, row->space_size);
789  return true;
790 }
791 
792 int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
793  int16_t index;
794  int16_t total = 0;
795 
796  for (index = 0; index < threshold; index++)
797  total += stats->pile_count (index);
798  return total;
799 }
800 
801 
802 /*************************************************************************
803  * improve_row_threshold()
804  * Try to recognise a "normal line" -
805  * > 25 gaps
806  * && space > 3 * kn && space > 10
807  * (I.e. reasonably large space and kn:sp ratio)
808  * && > 3/4 # gaps < kn + (sp - kn)/3
809  * (I.e. most gaps are well away from space estimate)
810  * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
811  * somewhere in the histogram between kn and sp
812  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
813  * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
814  * try moving the default threshold to within this band but leave the
815  * fuzzy limit calculation as at present.
816  *************************************************************************/
817 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
818  float sp = row->space_size;
819  float kn = row->kern_size;
820  int16_t reqd_zero_width = 0;
821  int16_t zero_width = 0;
822  int16_t zero_start = 0;
823  int16_t index = 0;
824 
825  if (tosp_debug_level > 10)
826  tprintf ("Improve row threshold 0");
827  if ((all_gap_stats->get_total () <= 25) ||
828  (sp <= 10) ||
829  (sp <= 3 * kn) ||
830  (stats_count_under (all_gap_stats,
831  static_cast<int16_t>(ceil (kn + (sp - kn) / 3 + 0.5))) <
832  (0.75 * all_gap_stats->get_total ())))
833  return;
834  if (tosp_debug_level > 10)
835  tprintf (" 1");
836  /*
837  Look for the first region of all 0's in the histogram which is wider than
838  max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
839  threshold is not within it, move the threshold so that is is just inside it.
840  */
841  reqd_zero_width = static_cast<int16_t>(floor ((sp - kn) / 3 + 0.5));
842  if (reqd_zero_width < 3)
843  reqd_zero_width = 3;
844 
845  for (index = int16_t (ceil (kn)); index < int16_t (floor (sp)); index++) {
846  if (all_gap_stats->pile_count (index) == 0) {
847  if (zero_width == 0)
848  zero_start = index;
849  zero_width++;
850  }
851  else {
852  if (zero_width >= reqd_zero_width)
853  break;
854  else {
855  zero_width = 0;
856  }
857  }
858  }
859  index--;
860  if (tosp_debug_level > 10)
861  tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
862  reqd_zero_width, zero_width, zero_start, row->space_threshold);
863  if ((zero_width < reqd_zero_width) ||
864  ((row->space_threshold >= zero_start) &&
865  (row->space_threshold <= index)))
866  return;
867  if (tosp_debug_level > 10)
868  tprintf (" 2");
869  if (row->space_threshold < zero_start) {
870  if (tosp_debug_level > 5)
871  tprintf
872  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
873  kn, sp, zero_start, index, row->space_threshold, zero_start);
874  row->space_threshold = zero_start;
875  }
876  if (row->space_threshold > index) {
877  if (tosp_debug_level > 5)
878  tprintf
879  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
880  kn, sp, zero_start, index, row->space_threshold, index);
881  row->space_threshold = index;
882  }
883 }
884 
885 
886 /**********************************************************************
887  * make_prop_words
888  *
889  * Convert a TO_ROW to a ROW.
890  **********************************************************************/
      // Walks row->blob_list(), gathering consecutive blobs into WERDs. A word is
      // closed when make_a_word_break() asks for a break, when a repeated-char
      // word from row->rep_words has to be inserted before the next blob, or when
      // the blob iterator wraps back to the start of the list. Repeated-char words
      // are spliced in by comparing blob positions with next_rep_char_word_right.
      // Returns the newly allocated ROW, or nullptr when the row has no blobs.
      // `rotation` is not referenced anywhere in the visible body.
      // NOTE(review): the line carrying the return type and function name is
      // absent from this listing (numbering jumps 890 -> 892) - confirm against
      // the repository source.
892  TO_ROW *row, // row to make
893  FCOORD rotation // for drawing
894  ) {
895  bool bol; // start of line
896  /* prev_ values are for start of word being built. non prev_ values are for
897  the gap between the word being built and the next one. */
898  bool prev_fuzzy_sp; // probably space
899  bool prev_fuzzy_non; // probably not
900  uint8_t prev_blanks; // in front of word
901  bool fuzzy_sp = false; // probably space
902  bool fuzzy_non = false; // probably not
903  uint8_t blanks = 0; // in front of word
904  bool prev_gap_was_a_space = false;
905  bool break_at_next_gap = false;
906  ROW *real_row; // output row
907  C_OUTLINE_IT cout_it;
908  C_BLOB_LIST cblobs;
909  C_BLOB_IT cblob_it = &cblobs;
910  WERD_LIST words;
911  WERD *word; // new word
      // INT32_MAX acts as "no repeated-char word pending": no blob's left edge
      // can compare greater than it.
912  int32_t next_rep_char_word_right = INT32_MAX;
913  float repetition_spacing; // gap between repetitions
914  int32_t xstarts[2]; // row ends
915  int32_t prev_x; // end of prev blob
916  BLOBNBOX *bblob; // current blob
917  TBOX blob_box; // bounding box
918  BLOBNBOX_IT box_it; // iterator
919  TBOX prev_blob_box;
920  TBOX next_blob_box;
921  int16_t prev_gap = INT16_MAX;
922  int16_t current_gap = INT16_MAX;
923  int16_t next_gap = INT16_MAX;
924  int16_t prev_within_xht_gap = INT16_MAX;
925  int16_t current_within_xht_gap = INT16_MAX;
926  int16_t next_within_xht_gap = INT16_MAX;
927  int16_t word_count = 0;
928 
929  // repeated char words
930  WERD_IT rep_char_it(&(row->rep_words));
931  if (!rep_char_it.empty ()) {
932  next_rep_char_word_right =
933  rep_char_it.data ()->bounding_box ().right ();
934  }
935 
936  prev_x = -INT16_MAX;
937  cblob_it.set_to_list (&cblobs);
938  box_it.set_to_list (row->blob_list ());
939  // new words
940  WERD_IT word_it(&words);
941  bol = true;
942  prev_blanks = 0;
943  prev_fuzzy_sp = false;
944  prev_fuzzy_non = false;
945  if (!box_it.empty ()) {
946  xstarts[0] = box_it.data ()->bounding_box ().left ();
947  if (xstarts[0] > next_rep_char_word_right) {
948  /* We need to insert a repeated char word at the start of the row */
949  word = rep_char_it.extract ();
950  word_it.add_after_then_move (word);
951  /* Set spaces before repeated char word */
952  word->set_flag (W_BOL, true);
953  bol = false;
954  word->set_blanks (0);
955  //NO uncertainty
956  word->set_flag (W_FUZZY_SP, false);
957  word->set_flag (W_FUZZY_NON, false);
958  xstarts[0] = word->bounding_box ().left ();
959  /* Set spaces after repeated char word (and leave current word set) */
960  repetition_spacing = find_mean_blob_spacing (word);
961  current_gap = box_it.data ()->bounding_box ().left () -
962  next_rep_char_word_right;
963  current_within_xht_gap = current_gap;
      // A gap wider than tosp_rep_space mean repetition gaps counts as at
      // least one blank; the count is the gap measured in space_size units.
964  if (current_gap > tosp_rep_space * repetition_spacing) {
965  prev_blanks = static_cast<uint8_t>(floor (current_gap / row->space_size));
966  if (prev_blanks < 1)
967  prev_blanks = 1;
968  }
969  else
970  prev_blanks = 0;
971  if (tosp_debug_level > 5)
972  tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
973  box_it.data ()->bounding_box ().left (),
974  box_it.data ()->bounding_box ().bottom (),
975  repetition_spacing, current_gap);
976  prev_fuzzy_sp = false;
977  prev_fuzzy_non = false;
      // Advance to the next repeated-char word, if any remain after extract().
978  if (rep_char_it.empty ()) {
979  next_rep_char_word_right = INT32_MAX;
980  }
981  else {
982  rep_char_it.forward ();
983  next_rep_char_word_right =
984  rep_char_it.data ()->bounding_box ().right ();
985  }
986  }
987 
      // Look ahead once so that next_blob_box, next_gap and next_within_xht_gap
      // are set before the first pass of the loop shifts them into "current".
988  peek_at_next_gap(row,
989  box_it,
990  next_blob_box,
991  next_gap,
992  next_within_xht_gap);
993  do {
994  bblob = box_it.data ();
995  blob_box = bblob->bounding_box ();
      // A blob joined to its predecessor has its outlines appended to the
      // C_BLOB most recently added to cblobs; its own C_BLOB is then deleted.
996  if (bblob->joined_to_prev ()) {
997  if (bblob->cblob () != nullptr) {
998  cout_it.set_to_list (cblob_it.data ()->out_list ());
999  cout_it.move_to_last ();
1000  cout_it.add_list_after (bblob->cblob ()->out_list ());
1001  delete bblob->cblob ();
1002  }
1003  } else {
1004  if (bblob->cblob() != nullptr)
1005  cblob_it.add_after_then_move (bblob->cblob ());
1006  prev_x = blob_box.right ();
1007  }
1008  box_it.forward (); //next one
1009  bblob = box_it.data ();
1010  blob_box = bblob->bounding_box ();
1011 
1012  if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
1013  /* Real Blob - not multiple outlines or pre-chopped */
      // Shift the look-ahead window by one blob: prev <- current <- next.
1014  prev_gap = current_gap;
1015  prev_within_xht_gap = current_within_xht_gap;
1016  prev_blob_box = next_blob_box;
1017  current_gap = next_gap;
1018  current_within_xht_gap = next_within_xht_gap;
1019  peek_at_next_gap(row,
1020  box_it,
1021  next_blob_box,
1022  next_gap,
1023  next_within_xht_gap);
1024 
      // With tosp_only_use_xht_gaps the neighbouring gaps handed to
      // make_a_word_break() are the within-xheight ones.
1025  int16_t prev_gap_arg = prev_gap;
1026  int16_t next_gap_arg = next_gap;
1027  if (tosp_only_use_xht_gaps) {
1028  prev_gap_arg = prev_within_xht_gap;
1029  next_gap_arg = next_within_xht_gap;
1030  }
1031  // Decide if a word-break should be inserted
1032  if (blob_box.left () > next_rep_char_word_right ||
1033  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1034  current_gap, current_within_xht_gap,
1035  next_blob_box, next_gap_arg,
1036  blanks, fuzzy_sp, fuzzy_non,
1037  prev_gap_was_a_space,
1038  break_at_next_gap) ||
1039  box_it.at_first()) {
1040  /* Form a new word out of the blobs collected */
      // prev_blanks was recorded at the previous break, i.e. it is the
      // spacing in front of the word being closed here.
      // NOTE(review): presumably the WERD constructor takes over the
      // blobs held in cblobs - confirm against the WERD class.
1041  word = new WERD (&cblobs, prev_blanks, nullptr);
1042  word_count++;
1043  word_it.add_after_then_move (word);
1044  if (bol) {
1045  word->set_flag (W_BOL, true);
1046  bol = false;
1047  }
1048  if (prev_fuzzy_sp)
1049  //probably space
1050  word->set_flag (W_FUZZY_SP, true);
1051  else if (prev_fuzzy_non)
1052  word->set_flag (W_FUZZY_NON, true);
1053  //probably not
1054 
1055  if (blob_box.left () > next_rep_char_word_right) {
1056  /* We need to insert a repeated char word */
1057  word = rep_char_it.extract ();
1058  word_it.add_after_then_move (word);
1059 
1060  /* Set spaces before repeated char word */
1061  repetition_spacing = find_mean_blob_spacing (word);
1062  current_gap = word->bounding_box ().left () - prev_x;
1063  current_within_xht_gap = current_gap;
1064  if (current_gap > tosp_rep_space * repetition_spacing) {
1065  blanks =
1066  static_cast<uint8_t>(floor (current_gap / row->space_size));
1067  if (blanks < 1)
1068  blanks = 1;
1069  }
1070  else
1071  blanks = 0;
1072  if (tosp_debug_level > 5)
1073  tprintf
1074  ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1075  word->bounding_box ().left (),
1076  word->bounding_box ().bottom (),
1077  repetition_spacing, current_gap, blanks);
1078  word->set_blanks (blanks);
1079  //NO uncertainty
1080  word->set_flag (W_FUZZY_SP, false);
1081  word->set_flag (W_FUZZY_NON, false);
1082 
1083  /* Set spaces after repeated char word (and leave current word set) */
1084  current_gap =
1085  blob_box.left () - next_rep_char_word_right;
1086  if (current_gap > tosp_rep_space * repetition_spacing) {
1087  blanks = static_cast<uint8_t>(current_gap / row->space_size);
1088  if (blanks < 1)
1089  blanks = 1;
1090  }
1091  else
1092  blanks = 0;
1093  if (tosp_debug_level > 5)
1094  tprintf (" Rgap:%d (%d blanks)\n",
1095  current_gap, blanks);
1096  fuzzy_sp = false;
1097  fuzzy_non = false;
1098 
1099  if (rep_char_it.empty ()) {
1100  next_rep_char_word_right = INT32_MAX;
1101  }
1102  else {
1103  rep_char_it.forward ();
1104  next_rep_char_word_right =
1105  rep_char_it.data ()->bounding_box ().right ();
1106  }
1107  }
1108 
      // The row ends here only if the blob iterator has wrapped and no
      // repeated-char words remain; otherwise carry the spacing decided
      // for this gap forward to the next word.
1109  if (box_it.at_first () && rep_char_it.empty ()) {
1110  //at end of line
1111  word->set_flag (W_EOL, true);
1112  xstarts[1] = prev_x;
1113  }
1114  else {
1115  prev_blanks = blanks;
1116  prev_fuzzy_sp = fuzzy_sp;
1117  prev_fuzzy_non = fuzzy_non;
1118  }
1119  }
1120  }
1121  }
1122  while (!box_it.at_first ()); //until back at start
1123 
1124  /* Insert any further repeated char words */
1125  while (!rep_char_it.empty ()) {
1126  word = rep_char_it.extract ();
1127  word_it.add_after_then_move (word);
1128 
1129  /* Set spaces before repeated char word */
1130  repetition_spacing = find_mean_blob_spacing (word);
1131  current_gap = word->bounding_box ().left () - prev_x;
1132  if (current_gap > tosp_rep_space * repetition_spacing) {
1133  blanks = static_cast<uint8_t>(floor (current_gap / row->space_size));
1134  if (blanks < 1)
1135  blanks = 1;
1136  }
1137  else
1138  blanks = 0;
1139  if (tosp_debug_level > 5)
1140  tprintf(
1141  "Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1142  word->bounding_box().left(), word->bounding_box().bottom(),
1143  repetition_spacing, current_gap, blanks);
1144  word->set_blanks (blanks);
1145  //NO uncertainty
1146  word->set_flag (W_FUZZY_SP, false);
1147  word->set_flag (W_FUZZY_NON, false);
1148  prev_x = word->bounding_box ().right ();
1149  if (rep_char_it.empty ()) {
1150  //at end of line
1151  word->set_flag (W_EOL, true);
1152  xstarts[1] = prev_x;
1153  }
1154  else {
1155  rep_char_it.forward ();
1156  }
1157  }
      // Build the output ROW; kern_size and space_size are narrowed to int16_t
      // for its constructor, then the collected words are moved into it.
1158  real_row = new ROW (row,
1159  static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1160  word_it.set_to_list (real_row->word_list ());
1161  //put words in row
1162  word_it.add_list_after (&words);
1163  real_row->recalc_bounding_box ();
1164 
1165  if (tosp_debug_level > 4) {
1166  tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1167  word_count,
1168  real_row->bounding_box ().left (),
1169  real_row->bounding_box ().bottom (),
1170  real_row->bounding_box ().right (),
1171  real_row->bounding_box ().top ());
1172  }
1173  return real_row;
1174  }
      // Empty blob list: nothing to convert.
1175  return nullptr;
1176 }
1178 /**********************************************************************
1179  * make_blob_words
1180  *
1181  * Converts words into blobs so that each blob is a single character.
1182  * Used for chopper test.
1183  **********************************************************************/
      // Walks row->blob_list() and closes a WERD (always with a blank count of 1)
      // each time the next blob is not joined to its predecessor and cblobs is
      // non-empty. Blobs flagged joined_to_prev() have their outlines appended to
      // the preceding C_BLOB instead of starting a new one.
      // Returns the newly allocated ROW, or nullptr when the row has no blobs.
      // `rotation` is not referenced anywhere in the visible body.
      // NOTE(review): the line carrying the return type and function name is
      // absent from this listing (numbering jumps 1183 -> 1185) - confirm against
      // the repository source.
1185  TO_ROW *row, // row to make
1186  FCOORD rotation // for drawing
1187  ) {
1188  bool bol; // start of line
1189  ROW *real_row; // output row
1190  C_OUTLINE_IT cout_it;
1191  C_BLOB_LIST cblobs;
1192  C_BLOB_IT cblob_it = &cblobs;
1193  WERD_LIST words;
1194  WERD *word; // new word
1195  BLOBNBOX *bblob; // current blob
1196  TBOX blob_box; // bounding box
1197  BLOBNBOX_IT box_it; // iterator
1198  int16_t word_count = 0;
1199 
1200  cblob_it.set_to_list(&cblobs);
1201  box_it.set_to_list(row->blob_list());
1202  // new words
1203  WERD_IT word_it(&words);
1204  bol = true;
1205  if (!box_it.empty()) {
1206 
1207  do {
1208  bblob = box_it.data();
1209  blob_box = bblob->bounding_box();
      // Joined blob: move its outlines onto the C_BLOB last added to cblobs
      // and delete its own C_BLOB.
1210  if (bblob->joined_to_prev()) {
1211  if (bblob->cblob() != nullptr) {
1212  cout_it.set_to_list(cblob_it.data()->out_list());
1213  cout_it.move_to_last();
1214  cout_it.add_list_after(bblob->cblob()->out_list());
1215  delete bblob->cblob();
1216  }
1217  } else {
1218  if (bblob->cblob() != nullptr)
1219  cblob_it.add_after_then_move(bblob->cblob());
1220  }
1221  box_it.forward(); // next one
1222  bblob = box_it.data();
1223  blob_box = bblob->bounding_box();
1224 
      // The blob now under the iterator starts a new character, so whatever
      // has been gathered in cblobs becomes a word of its own.
1225  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1226  word = new WERD(&cblobs, 1, nullptr);
1227  word_count++;
1228  word_it.add_after_then_move(word);
1229  if (bol) {
1230  word->set_flag(W_BOL, true);
1231  bol = false;
1232  }
1233  if (box_it.at_first()) { // at end of line
1234  word->set_flag(W_EOL, true);
1235  }
1236  }
1237  }
1238  while (!box_it.at_first()); // until back at start
1239  /* Setup the row with created words. */
1240  real_row = new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1241  word_it.set_to_list(real_row->word_list());
1242  //put words in row
1243  word_it.add_list_after(&words);
1244  real_row->recalc_bounding_box();
1245  if (tosp_debug_level > 4) {
1246  tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1247  word_count,
1248  real_row->bounding_box().left(),
1249  real_row->bounding_box().bottom(),
1250  real_row->bounding_box().right(),
1251  real_row->bounding_box().top());
1252  }
1253  return real_row;
1254  }
      // Empty blob list: nothing to convert.
1255  return nullptr;
1256 }
1257 
// make_a_word_break
//
// Decides whether the gap in front of blob_box should become a word break.
// Returns true for a break; blanks, fuzzy_sp and fuzzy_non then describe it
// (the body notes that these values are only meaningful when true is returned).
// prev_gap_was_a_space and break_at_next_gap are in/out state that the caller
// passes back in on the following call.
// NOTE(review): several lines of this function are absent from this listing
// (numbering gaps at 1287, 1292, 1362, 1391, 1393, 1406, 1427, 1446, 1465 and
// 1545), so some conditions below appear without their opening `if` or with
// part of an expression missing - consult the repository source before
// relying on the exact conditions.
1258 bool Textord::make_a_word_break(
1259  TO_ROW* row, // row being made
1260  TBOX blob_box, // for next_blob // how many blanks?
1261  int16_t prev_gap,
1262  TBOX prev_blob_box,
1263  int16_t real_current_gap,
1264  int16_t within_xht_current_gap,
1265  TBOX next_blob_box,
1266  int16_t next_gap,
1267  uint8_t& blanks,
1268  bool& fuzzy_sp,
1269  bool& fuzzy_non,
1270  bool& prev_gap_was_a_space,
1271  bool& break_at_next_gap) {
1272  bool space;
1273  int16_t current_gap;
1274  float fuzzy_sp_to_kn_limit;
1275 
      // A break requested by the previous call (see the punctuation rule further
      // down, which sets break_at_next_gap) is honoured unconditionally.
1276  if (break_at_next_gap) {
1277  break_at_next_gap = false;
1278  return true;
1279  }
1280  /* Inhibit using the reduced gap if
1281  The kerning is large - chars are not kerned and reducing "f"s can cause
1282  erroneous blanks
1283  OR The real gap is less than 0
1284  OR The real gap is less than the kerning estimate
1285  */
1286  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1288  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1289  //Ignore the difference
1290  within_xht_current_gap = real_current_gap;
1291 
1293  current_gap = within_xht_current_gap;
1294  else
1295  current_gap = real_current_gap;
1296 
1297  if (tosp_old_to_method) {
1298  //Boring old method
      // Three bands: above max_nonspace is a space; between max_nonspace and
      // min_space it is fuzzy, split at space_threshold; from min_space up the
      // blank count is the gap measured in space_size units (at least 1).
1299  space = current_gap > row->max_nonspace;
1300  if (space && (current_gap < INT16_MAX)) {
1301  if (current_gap < row->min_space) {
1302  if (current_gap > row->space_threshold) {
1303  blanks = 1;
1304  fuzzy_sp = true;
1305  fuzzy_non = false;
1306  }
1307  else {
1308  blanks = 0;
1309  fuzzy_sp = false;
1310  fuzzy_non = true;
1311  }
1312  }
1313  else {
1314  blanks = static_cast<uint8_t>(current_gap / row->space_size);
1315  if (blanks < 1)
1316  blanks = 1;
1317  fuzzy_sp = false;
1318  fuzzy_non = false;
1319  }
1320  }
1321  return space;
1322  }
1323  else {
1324  /* New exciting heuristic method */
1325  if (prev_blob_box.null_box ()) // Beginning of row
1326  prev_gap_was_a_space = true;
1327 
1328  //Default as old TO
1329  space = current_gap > row->space_threshold;
1330 
1331  /* Set defaults for the word break in case we find one. Currently there are
1332  no fuzzy spaces. Depending on the reliability of the different heuristics
1333  we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1334  be used if the function returns true - ie the word is to be broken.
1335  */
      // With space_size > 1 the blank count is the rounded gap / space_size;
      // otherwise the raw gap is used. Either way it is clipped to [1, UINT8_MAX].
1336  int num_blanks = current_gap;
1337  if (row->space_size > 1.0f)
1338  num_blanks = IntCastRounded(current_gap / row->space_size);
1339  blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1340  fuzzy_sp = false;
1341  fuzzy_non = false;
1342  /*
1343  If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1344  despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1345  context.
1346  */
      // In every rule below, the second argument of mark_gap() is the id of the
      // rule that fired (20-22 for the xheight flips, 1-9 for the heuristics).
1347  if (tosp_use_xht_gaps &&
1348  (real_current_gap <= row->max_nonspace) &&
1349  (within_xht_current_gap > row->max_nonspace)) {
1350  space = true;
1351  fuzzy_non = true;
1352 #ifndef GRAPHICS_DISABLED
1353  mark_gap (blob_box, 20,
1354  prev_gap, prev_blob_box.width (),
1355  current_gap, next_blob_box.width (), next_gap);
1356 #endif
1357  }
1358  else if (tosp_use_xht_gaps &&
1359  (real_current_gap <= row->space_threshold) &&
1360  (within_xht_current_gap > row->space_threshold)) {
1361  space = true;
1363  fuzzy_sp = true;
1364  else
1365  fuzzy_non = true;
1366 #ifndef GRAPHICS_DISABLED
1367  mark_gap (blob_box, 21,
1368  prev_gap, prev_blob_box.width (),
1369  current_gap, next_blob_box.width (), next_gap);
1370 #endif
1371  }
1372  else if (tosp_use_xht_gaps &&
1373  (real_current_gap < row->min_space) &&
1374  (within_xht_current_gap >= row->min_space)) {
1375  space = true;
1376 #ifndef GRAPHICS_DISABLED
1377  mark_gap (blob_box, 22,
1378  prev_gap, prev_blob_box.width (),
1379  current_gap, next_blob_box.width (), next_gap);
1380 #endif
1381  }
      // Punctuation rule: when blob_box looks like punctuation and the previous
      // blob does not, ask for a break at the gap handled by the next call.
1382  else if (tosp_force_wordbreak_on_punct &&
1383  !suspected_punct_blob(row, prev_blob_box) &&
1384  suspected_punct_blob(row, blob_box)) {
1385  break_at_next_gap = true;
1386  }
1387  /* Now continue with normal heuristics */
1388  else if ((current_gap < row->min_space) &&
1389  (current_gap > row->space_threshold)) {
1390  /* Heuristics to turn dubious spaces to kerns */
1392  fuzzy_sp_to_kn_limit = row->kern_size +
1394  (row->space_size - row->kern_size);
1395  else
1396  fuzzy_sp_to_kn_limit = 99999.0f;
1397 
1398  /* If current gap is significantly smaller than the previous space the other
1399  side of a narrow blob then this gap is a kern. */
1400  if ((prev_blob_box.width () > 0) &&
1401  narrow_blob (row, prev_blob_box) &&
1402  prev_gap_was_a_space &&
1403  (current_gap <= tosp_gap_factor * prev_gap)) {
1404  if ((tosp_all_flips_fuzzy) ||
1405  (current_gap > fuzzy_sp_to_kn_limit)) {
1407  fuzzy_non = true;
1408  else
1409  fuzzy_sp = true;
1410  }
1411  else
1412  space = false;
1413 #ifndef GRAPHICS_DISABLED
1414  mark_gap (blob_box, 1,
1415  prev_gap, prev_blob_box.width (),
1416  current_gap, next_blob_box.width (), next_gap);
1417 #endif
1418  }
1419  /* If current gap not much bigger than the previous kern the other side of a
1420  narrow blob then this gap is a kern as well */
1421  else if ((prev_blob_box.width () > 0) &&
1422  narrow_blob (row, prev_blob_box) &&
1423  !prev_gap_was_a_space &&
1424  (current_gap * tosp_gap_factor <= prev_gap)) {
1425  if ((tosp_all_flips_fuzzy) ||
1426  (current_gap > fuzzy_sp_to_kn_limit)) {
1428  fuzzy_non = true;
1429  else
1430  fuzzy_sp = true;
1431  }
1432  else
1433  space = false;
1434 #ifndef GRAPHICS_DISABLED
1435  mark_gap (blob_box, 2,
1436  prev_gap, prev_blob_box.width (),
1437  current_gap, next_blob_box.width (), next_gap);
1438 #endif
1439  }
      // Rules 3 and 4 mirror rules 1 and 2 using the next blob and next gap,
      // with "next gap is a space" tested as next_gap > space_threshold.
1440  else if ((next_blob_box.width () > 0) &&
1441  narrow_blob (row, next_blob_box) &&
1442  (next_gap > row->space_threshold) &&
1443  (current_gap <= tosp_gap_factor * next_gap)) {
1444  if ((tosp_all_flips_fuzzy) ||
1445  (current_gap > fuzzy_sp_to_kn_limit)) {
1447  fuzzy_non = true;
1448  else
1449  fuzzy_sp = true;
1450  }
1451  else
1452  space = false;
1453 #ifndef GRAPHICS_DISABLED
1454  mark_gap (blob_box, 3,
1455  prev_gap, prev_blob_box.width (),
1456  current_gap, next_blob_box.width (), next_gap);
1457 #endif
1458  }
1459  else if ((next_blob_box.width () > 0) &&
1460  narrow_blob (row, next_blob_box) &&
1461  (next_gap <= row->space_threshold) &&
1462  (current_gap * tosp_gap_factor <= next_gap)) {
1463  if ((tosp_all_flips_fuzzy) ||
1464  (current_gap > fuzzy_sp_to_kn_limit)) {
1466  fuzzy_non = true;
1467  else
1468  fuzzy_sp = true;
1469  }
1470  else
1471  space = false;
1472 #ifndef GRAPHICS_DISABLED
1473  mark_gap (blob_box, 4,
1474  prev_gap, prev_blob_box.width (),
1475  current_gap, next_blob_box.width (), next_gap);
1476 #endif
1477  }
      // Rule 6: a narrow neighbour on either side, with none of rules 1-4
      // matching, leaves the break in place but marks it as a fuzzy space.
1478  else if ((((next_blob_box.width () > 0) &&
1479  narrow_blob (row, next_blob_box)) ||
1480  ((prev_blob_box.width () > 0) &&
1481  narrow_blob (row, prev_blob_box)))) {
1482  fuzzy_sp = true;
1483 #ifndef GRAPHICS_DISABLED
1484  mark_gap (blob_box, 6,
1485  prev_gap, prev_blob_box.width (),
1486  current_gap, next_blob_box.width (), next_gap);
1487 #endif
1488  }
1489  }
1490  else if ((current_gap > row->max_nonspace) &&
1491  (current_gap <= row->space_threshold)) {
1492 
1493  /* Heuristics to turn dubious kerns to spaces */
1494  /* TRIED THIS BUT IT MADE THINGS WORSE
1495  if (prev_gap == INT16_MAX)
1496  prev_gap = 0; // start of row
1497  if (next_gap == INT16_MAX)
1498  next_gap = 0; // end of row
1499  */
      // Rule 7: both neighbours are wide and the gap is at least
      // tosp_kern_gap_factor1 times the larger neighbouring gap.
1500  if ((prev_blob_box.width () > 0) &&
1501  (next_blob_box.width () > 0) &&
1502  (current_gap >=
1503  tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1504  wide_blob (row, prev_blob_box) &&
1505  wide_blob (row, next_blob_box)) {
1506 
1507  space = true;
1508  /*
1509  tosp_flip_caution is an attempt to stop the default changing in cases
1510  where there is a large difference between the kern and space estimates.
1511  See problem in 'chiefs' where "have" gets split in the quotation.
1512  */
1513  if ((tosp_flip_fuzz_kn_to_sp) &&
1514  ((tosp_flip_caution <= 0) ||
1515  (tosp_flip_caution * row->kern_size > row->space_size)))
1516  fuzzy_sp = true;
1517  else
1518  fuzzy_non = true;
1519 #ifndef GRAPHICS_DISABLED
1520  mark_gap (blob_box, 7,
1521  prev_gap, prev_blob_box.width (),
1522  current_gap, next_blob_box.width (), next_gap);
1523 #endif
1524  } else if (prev_blob_box.width() > 0 &&
1525  next_blob_box.width() > 0 &&
1526  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1527  current_gap >=
1528  tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1529  !(narrow_blob(row, prev_blob_box) ||
1530  suspected_punct_blob(row, prev_blob_box)) &&
1531  !(narrow_blob(row, next_blob_box) ||
1532  suspected_punct_blob(row, next_blob_box))) {
1533  space = true;
1534  fuzzy_non = true;
1535 #ifndef GRAPHICS_DISABLED
1536  mark_gap (blob_box, 8,
1537  prev_gap, prev_blob_box.width (),
1538  current_gap, next_blob_box.width (), next_gap);
1539 #endif
1540  }
1541  else if ((tosp_kern_gap_factor3 > 0) &&
1542  (prev_blob_box.width () > 0) &&
1543  (next_blob_box.width () > 0) &&
1544  (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1546  (!suspected_punct_blob (row, prev_blob_box) &&
1547  !suspected_punct_blob (row, next_blob_box)))) {
1548  space = true;
1549  fuzzy_non = true;
1550 #ifndef GRAPHICS_DISABLED
1551  mark_gap (blob_box, 9,
1552  prev_gap, prev_blob_box.width (),
1553  current_gap, next_blob_box.width (), next_gap);
1554 #endif
1555  }
1556  }
1557  if (tosp_debug_level > 10)
1558  tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1559  "next_gap = %d\n", space ? 1 : 0, current_gap,
1560  prev_gap, next_gap);
      // A break flagged fuzzy_non is not remembered as a space for the next call.
1561  prev_gap_was_a_space = space && !(fuzzy_non);
1562  return space;
1563  }
1564 }
1565 
// narrow_blob
//
// Returns true when blob_box counts as narrow: either its width is at most
// tosp_narrow_fraction * row->xheight, or its width/height ratio is at most a
// limit given on a line that is absent from this listing.
// NOTE(review): listing line 1570 (the right-hand side of the aspect-ratio
// comparison) is missing - confirm against the repository source.
1566 bool Textord::narrow_blob(TO_ROW* row, TBOX blob_box) {
1567  bool result;
1568  result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1569  ((static_cast<float>(blob_box.width ()) / blob_box.height ()) <=
1571  return result;
1572 }
1573 
// wide_blob
//
// Returns true when blob_box counts as wide.
//  - tosp_wide_fraction > 0: the width must reach tosp_wide_fraction *
//    row->xheight; if tosp_wide_aspect_ratio > 0 as well, the width/height
//    ratio must additionally exceed a limit given on a line absent from this
//    listing.
//  - otherwise: wide simply means "not narrow_blob()".
// NOTE(review): listing line 1580 (the right-hand side of the aspect-ratio
// comparison) is missing - confirm against the repository source.
1574 bool Textord::wide_blob(TO_ROW* row, TBOX blob_box) {
1575  bool result;
1576  if (tosp_wide_fraction > 0) {
1577  if (tosp_wide_aspect_ratio > 0)
1578  result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1579  ((static_cast<float>(blob_box.width ()) / blob_box.height ()) >
1581  else
1582  result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1583  }
1584  else
1585  result = !narrow_blob (row, blob_box);
1586  return result;
1587 }
1588 
1589 bool Textord::suspected_punct_blob(TO_ROW* row, TBOX box) {
1590  bool result;
1591  float baseline;
1592  float blob_x_centre;
1593  /* Find baseline of centre of blob */
1594  blob_x_centre = (box.right () + box.left ()) / 2.0;
1595  baseline = row->baseline.y (blob_x_centre);
1596 
1597  result = (box.height () <= 0.66 * row->xheight) ||
1598  (box.top () < baseline + row->xheight / 2.0) ||
1599  (box.bottom () > baseline + row->xheight / 2.0);
1600  return result;
1601 }
1602 
1603 
1604 void Textord::peek_at_next_gap(TO_ROW *row,
1605  BLOBNBOX_IT box_it,
1606  TBOX &next_blob_box,
1607  int16_t &next_gap,
1608  int16_t &next_within_xht_gap) {
1609  TBOX next_reduced_blob_box;
1610  TBOX bit_beyond;
1611  BLOBNBOX_IT reduced_box_it = box_it;
1612 
1613  next_blob_box = box_next (&box_it);
1614  next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1615  if (box_it.at_first ()) {
1616  next_gap = INT16_MAX;
1617  next_within_xht_gap = INT16_MAX;
1618  }
1619  else {
1620  bit_beyond = box_it.data ()->bounding_box ();
1621  next_gap = bit_beyond.left () - next_blob_box.right ();
1622  bit_beyond = reduced_box_next (row, &reduced_box_it);
1623  next_within_xht_gap =
1624  bit_beyond.left () - next_reduced_blob_box.right ();
1625  }
1626 }
1627 
1628 
1629 #ifndef GRAPHICS_DISABLED
// mark_gap
//
// Debug aid for gaps whose space/kern decision was changed by a heuristic:
// picks a pen colour from the rule id, draws an ellipse over the gap to the
// left of `blob` on to_win, and, when tosp_debug_level > 5, prints the rule id
// and the surrounding gap/width measurements.
// Rule ids with no case of their own fall through to BLACK, which rules 4 and
// 9 also use. No visible call site in this listing passes rule 5.
// NOTE(review): listing line 1682 is missing; it opens the block closed on
// line 1695, so the drawing is presumably conditional - confirm against the
// repository source.
1630 void Textord::mark_gap(
1631  TBOX blob, // blob following gap
1632  int16_t rule, // heuristic id
1633  int16_t prev_gap,
1634  int16_t prev_blob_width,
1635  int16_t current_gap,
1636  int16_t next_blob_width,
1637  int16_t next_gap) {
1638  ScrollView::Color col; //of ellipse marking flipped gap
1639 
1640  switch (rule) {
1641  case 1:
1642  col = ScrollView::RED;
1643  break;
1644  case 2:
1645  col = ScrollView::CYAN;
1646  break;
1647  case 3:
1648  col = ScrollView::GREEN;
1649  break;
1650  case 4:
1651  col = ScrollView::BLACK;
1652  break;
1653  case 5:
1654  col = ScrollView::MAGENTA;
1655  break;
1656  case 6:
1657  col = ScrollView::BLUE;
1658  break;
1659 
1660  case 7:
1661  col = ScrollView::WHITE;
1662  break;
1663  case 8:
1664  col = ScrollView::YELLOW;
1665  break;
1666  case 9:
1667  col = ScrollView::BLACK;
1668  break;
1669 
1670  case 20:
1671  col = ScrollView::CYAN;
1672  break;
1673  case 21:
1674  col = ScrollView::GREEN;
1675  break;
1676  case 22:
1677  col = ScrollView::MAGENTA;
1678  break;
1679  default:
1680  col = ScrollView::BLACK;
1681  }
1683  to_win->Pen(col);
1684  /* if (rule < 20)
1685  //interior_style(to_win, INT_SOLID, false);
1686  else
1687  //interior_style(to_win, INT_HOLLOW, true);*/
      // The ellipse spans the gap horizontally (it ends at blob.left()) and the
      // blob's height vertically.
1688  //x radius
1689  to_win->Ellipse (current_gap / 2.0f,
1690  blob.height () / 2.0f, //y radius
1691  //x centre
1692  blob.left () - current_gap / 2.0f,
1693  //y centre
1694  blob.bottom () + blob.height () / 2.0f);
1695  }
1696  if (tosp_debug_level > 5)
1697  tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n",
1698  blob.left() - current_gap / 2, blob.bottom(), rule, prev_gap,
1699  prev_blob_width, current_gap, next_blob_width, next_gap);
1700 }
1701 #endif
1702 
1703 float Textord::find_mean_blob_spacing(WERD *word) {
1704  C_BLOB_IT cblob_it;
1705  TBOX blob_box;
1706  int32_t gap_sum = 0;
1707  int16_t gap_count = 0;
1708  int16_t prev_right;
1709 
1710  cblob_it.set_to_list (word->cblob_list ());
1711  if (!cblob_it.empty ()) {
1712  cblob_it.mark_cycle_pt ();
1713  prev_right = cblob_it.data ()->bounding_box ().right ();
1714  //first blob
1715  cblob_it.forward ();
1716  for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1717  blob_box = cblob_it.data ()->bounding_box ();
1718  gap_sum += blob_box.left () - prev_right;
1719  gap_count++;
1720  prev_right = blob_box.right ();
1721  }
1722  }
1723  if (gap_count > 0)
1724  return (gap_sum / static_cast<float>(gap_count));
1725  else
1726  return 0.0f;
1727 }
1728 
1729 
1730 bool Textord::ignore_big_gap(TO_ROW* row,
1731  int32_t row_length,
1732  GAPMAP* gapmap,
1733  int16_t left,
1734  int16_t right) {
1735  int16_t gap = right - left + 1;
1736 
1737  if (tosp_ignore_big_gaps > 999) return false; // Don't ignore
1738  if (tosp_ignore_big_gaps > 0)
1739  return (gap > tosp_ignore_big_gaps * row->xheight);
1740  if (gap > tosp_ignore_very_big_gaps * row->xheight)
1741  return true;
1742  if (tosp_ignore_big_gaps == 0) {
1743  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1744  return true;
1745  if ((gap > 1.75 * row->xheight) &&
1746  ((row_length > 35 * row->xheight) ||
1747  gapmap->table_gap (left, right)))
1748  return true;
1749  }
1750  else {
1751  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1752  if ((gap > gapmap_big_gaps * row->xheight) &&
1753  gapmap->table_gap (left, right))
1754  return true;
1755  }
1756  return false;
1757 }
1758 
1759 /**********************************************************************
1760  * reduced_box_next
1761  *
1762  * Compute the bounding box of this blob with merging of x overlaps
1763  * but no pre-chopping.
1764  * Then move the iterator on to the start of the next blob.
1765  * DON'T reduce the box for small things - eg punctuation.
1766  **********************************************************************/
// Returns the reduced box for the blob under `it`, or the full merged box
// when the reduction is rejected by the final test below. The result is
// stored on the head blob with set_reduced_box(); a blob whose red_box_set()
// is already true returns its stored box without recomputation.
// In both paths `it` is left on the next blob that has a cblob and is not
// joined to its predecessor.
// NOTE(review): listing lines 1810-1811 (inside the GRAPHICS_DISABLED guard)
// are missing - confirm against the repository source.
1767 TBOX Textord::reduced_box_next(
1768  TO_ROW *row, // current row
1769  BLOBNBOX_IT *it // iterator to blobds
1770  ) {
1771  BLOBNBOX *blob; //current blob
1772  BLOBNBOX *head_blob; //place to store box
1773  TBOX full_box; //full blob boundg box
1774  TBOX reduced_box; //box of significant part
1775  int16_t left_above_xht; //ABOVE xht left limit
1776  int16_t new_left_above_xht; //ABOVE xht left limit
1777 
1778  blob = it->data ();
      // Stored result available: only the iterator needs advancing.
1779  if (blob->red_box_set ()) {
1780  reduced_box = blob->reduced_box ();
1781  do {
1782  it->forward();
1783  blob = it->data();
1784  }
1785  while (blob->cblob() == nullptr || blob->joined_to_prev());
1786  return reduced_box;
1787  }
1788  head_blob = blob;
1789  full_box = blob->bounding_box ();
1790  reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
      // Absorb the pieces that belong to the head blob: entries without a cblob
      // only widen full_box, while joined entries widen reduced_box and may
      // lower left_above_xht.
1791  do {
1792  it->forward ();
1793  blob = it->data ();
1794  if (blob->cblob() == nullptr)
1795  //was pre-chopped
1796  full_box += blob->bounding_box ();
1797  else if (blob->joined_to_prev ()) {
1798  reduced_box +=
1799  reduced_box_for_blob(blob, row, &new_left_above_xht);
1800  left_above_xht = std::min(left_above_xht, new_left_above_xht);
1801  }
1802  }
1803  //until next real blob
1804  while (blob->cblob() == nullptr || blob->joined_to_prev());
1805 
      // Keep the reduced box only when it has positive width, is taller than
      // 0.7 * xheight, and its left edge plus tosp_near_lh_edge * width lies to
      // the left of left_above_xht; otherwise fall back to the full box.
1806  if ((reduced_box.width () > 0) &&
1807  ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1808  < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1809 #ifndef GRAPHICS_DISABLED
1812 #endif
1813  }
1814  else
1815  reduced_box = full_box;
1816  head_blob->set_reduced_box (reduced_box);
1817  return reduced_box;
1818 }
1819 
1820 
1821 /*************************************************************************
1822  * reduced_box_for_blob()
1823  * Find box for blob which is the same height and y position as the whole blob,
1824  * but whose left limit is the left most position of the blob ABOVE the
1825  * baseline and whose right limit is the right most position of the blob BELOW
1826  * the xheight.
1827  *
1828  *
1829  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1830  * "home". Perhaps we need something which say if the width ABOVE the
1831  * xht alone includes the whole of the reduced width, then use the full
1832  * blob box - Might still fail on italic F
1833  *
1834  * Alternatively we could be a little less severe and only reduce the
1835  * left and right edges by half the difference between the full box and
1836  * the reduced box.
1837  *
1838  * NOTE that we need to rotate all the coordinates as
1839  * find_blob_limits finds the y min and max within a specified x band
1840  *************************************************************************/
1841 TBOX Textord::reduced_box_for_blob(
1842  BLOBNBOX *blob,
1843  TO_ROW *row,
1844  int16_t *left_above_xht) {
1845  float baseline;
1846  float blob_x_centre;
1847  float left_limit;
1848  float right_limit;
1849  float junk;
1850  TBOX blob_box;
1851 
1852  /* Find baseline of centre of blob */
1853 
1854  blob_box = blob->bounding_box ();
1855  blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1856  baseline = row->baseline.y (blob_x_centre);
1857 
1858  /*
1859  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1860  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1861  */
1862  left_limit = static_cast<float>(INT32_MAX);
1863  junk = static_cast<float>(-INT32_MAX);
1864  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1865  static_cast<float>(INT16_MAX), left_limit, junk);
1866  if (left_limit > junk)
1867  *left_above_xht = INT16_MAX; //No area above xht
1868  else
1869  *left_above_xht = static_cast<int16_t>(floor (left_limit));
1870  /*
1871  Find reduced LH limit of blob - the left extent of the region ABOVE the
1872  baseline.
1873  */
1874  left_limit = static_cast<float>(INT32_MAX);
1875  junk = static_cast<float>(-INT32_MAX);
1876  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX),
1877  left_limit, junk);
1878 
1879  if (left_limit > junk)
1880  return TBOX (); //no area within xht so return empty box
1881  /*
1882  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1883  */
1884  junk = static_cast<float>(INT32_MAX);
1885  right_limit = static_cast<float>(-INT32_MAX);
1886  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX),
1887  (baseline + row->xheight), junk, right_limit);
1888  if (junk > right_limit)
1889  return TBOX (); //no area within xht so return empty box
1890 
1891  return TBOX (ICOORD (static_cast<int16_t>(floor (left_limit)), blob_box.bottom ()),
1892  ICOORD (static_cast<int16_t>(ceil (right_limit)), blob_box.top ()));
1893 }
1894 } // namespace tesseract
TBOX
Definition: cleanapi_test.cc:19
TO_ROW::min_space
int32_t min_space
Definition: blobbox.h:662
STATS::get_total
int32_t get_total() const
Definition: statistc.h:83
tesseract::Textord::tosp_stats_use_xht_gaps
bool tosp_stats_use_xht_gaps
Definition: textord.h:291
STATS::mean
double mean() const
Definition: statistc.cpp:119
tesseract::Textord::tosp_wide_fraction
double tosp_wide_fraction
Definition: textord.h:323
TO_ROW::rep_words
WERD_LIST rep_words
Definition: blobbox.h:667
tesseract::Textord::tosp_debug_level
int tosp_debug_level
Definition: textord.h:302
tesseract::Textord::tosp_large_kerning
double tosp_large_kerning
Definition: textord.h:363
TO_ROW::space_size
float space_size
Definition: blobbox.h:666
C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:69
tesseract::Textord::tosp_fuzzy_space_factor1
double tosp_fuzzy_space_factor1
Definition: textord.h:329
tesseract::Textord::tosp_all_flips_fuzzy
bool tosp_all_flips_fuzzy
Definition: textord.h:287
tesseract::Textord::tosp_min_sane_kn_sp
double tosp_min_sane_kn_sp
Definition: textord.h:353
tesseract::Textord::to_spacing
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:43
tesseract::Textord::tosp_improve_thresh
bool tosp_improve_thresh
Definition: textord.h:301
tesseract::Textord::tosp_table_kn_sp_ratio
double tosp_table_kn_sp_ratio
Definition: textord.h:345
plot_word_decisions
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:239
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
tesseract::Textord::tosp_ignore_big_gaps
double tosp_ignore_big_gaps
Definition: textord.h:339
baseline
Definition: mfoutline.h:62
BLOBNBOX::red_box_set
bool red_box_set() const
Definition: blobbox.h:258
PITCH_DEF_PROP
Definition: blobbox.h:48
ICOORD
integer coordinate
Definition: points.h:30
tesseract::Textord::tosp_fuzzy_space_factor
double tosp_fuzzy_space_factor
Definition: textord.h:327
ROW::recalc_bounding_box
void recalc_bounding_box()
Definition: ocrrow.cpp:96
tesseract::Textord::tosp_near_lh_edge
double tosp_near_lh_edge
Definition: textord.h:367
TBOX::top
int16_t top() const
Definition: rect.h:57
STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
TO_BLOCK
Definition: blobbox.h:691
tesseract::Textord::tosp_recovery_isolated_row_stats
bool tosp_recovery_isolated_row_stats
Definition: textord.h:285
ScrollView::CYAN
Definition: scrollview.h:107
tesseract::Textord::tosp_rule_9_test_punct
bool tosp_rule_9_test_punct
Definition: textord.h:297
ScrollView::Pen
void Pen(Color color)
Definition: scrollview.cpp:717
PITCH_CORR_PROP
Definition: blobbox.h:51
tesseract::Textord::tosp_kern_gap_factor2
double tosp_kern_gap_factor2
Definition: textord.h:336
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
tesseract::Textord::tosp_table_fuzzy_kn_sp_ratio
double tosp_table_fuzzy_kn_sp_ratio
Definition: textord.h:349
TO_ROW::pitch_decision
PITCH_TYPE pitch_decision
Definition: blobbox.h:649
tesseract::Textord::tosp_short_row
int tosp_short_row
Definition: textord.h:310
tesseract::Textord::tosp_row_use_cert_spaces
bool tosp_row_use_cert_spaces
Definition: textord.h:279
FCOORD
Definition: points.h:187
BLOBNBOX
Definition: blobbox.h:142
tesseract::Textord::make_blob_words
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1177
ScrollView::BLUE
Definition: scrollview.h:108
tesseract::Textord::tosp_narrow_blobs_not_cert
bool tosp_narrow_blobs_not_cert
Definition: textord.h:281
tesseract::Textord::tosp_flip_fuzz_kn_to_sp
bool tosp_flip_fuzz_kn_to_sp
Definition: textord.h:298
textord_show_initial_words
bool textord_show_initial_words
Definition: tovars.cpp:22
TBOX::height
int16_t height() const
Definition: rect.h:107
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
tesseract::Textord::tosp_redo_kern_limit
int tosp_redo_kern_limit
Definition: textord.h:306
statistc.h
tovars.h
tesseract::Textord::tosp_only_use_xht_gaps
bool tosp_only_use_xht_gaps
Definition: textord.h:295
tesseract::Textord::tosp_sanity_method
int tosp_sanity_method
Definition: textord.h:311
tesseract::Textord::tosp_only_small_gaps_for_kern
bool tosp_only_small_gaps_for_kern
Definition: textord.h:286
tesseract::Textord::tosp_table_xht_sp_ratio
double tosp_table_xht_sp_ratio
Definition: textord.h:347
tesseract::Textord::tosp_threshold_bias2
double tosp_threshold_bias2
Definition: textord.h:318
ScrollView::BLACK
Definition: scrollview.h:102
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
tesseract::Textord::tosp_row_use_cert_spaces1
bool tosp_row_use_cert_spaces1
Definition: textord.h:283
tesseract::Textord::tosp_fuzzy_limit_all
bool tosp_fuzzy_limit_all
Definition: textord.h:289
tesseract::Textord::tosp_old_to_constrain_sp_kn
bool tosp_old_to_constrain_sp_kn
Definition: textord.h:266
tesseract::Textord::tosp_enough_small_gaps
double tosp_enough_small_gaps
Definition: textord.h:343
tesseract::Textord::tosp_narrow_fraction
double tosp_narrow_fraction
Definition: textord.h:320
TBOX::null_box
bool null_box() const
Definition: rect.h:49
ScrollView::MAGENTA
Definition: scrollview.h:109
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
W_EOL
end of line
Definition: werd.h:47
BLOBNBOX::joined_to_prev
bool joined_to_prev() const
Definition: blobbox.h:255
tesseract::Textord::tosp_old_sp_kn_th_factor
double tosp_old_sp_kn_th_factor
Definition: textord.h:314
tesseract::Textord::tosp_dont_fool_with_small_kerns
double tosp_dont_fool_with_small_kerns
Definition: textord.h:365
TBOX::width
int16_t width() const
Definition: rect.h:114
tesseract::Textord::tosp_pass_wide_fuzz_sp_to_context
double tosp_pass_wide_fuzz_sp_to_context
Definition: textord.h:371
tesseract::Textord::tosp_only_use_prop_rows
bool tosp_only_use_prop_rows
Definition: textord.h:268
MAXSPACING
#define MAXSPACING
Definition: tospace.cpp:40
ScrollView::YELLOW
Definition: scrollview.h:105
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Textord::tosp_fuzzy_sp_fraction
double tosp_fuzzy_sp_fraction
Definition: textord.h:351
textord.h
tesseract::Textord::tosp_old_to_bug_fix
bool tosp_old_to_bug_fix
Definition: textord.h:275
ScrollView::WHITE
Definition: scrollview.h:103
ROW::bounding_box
TBOX bounding_box() const
Definition: ocrrow.h:87
GAPMAP
Definition: gap_map.h:16
TO_ROW::fixed_pitch
float fixed_pitch
Definition: blobbox.h:650
tesseract::Textord::tosp_wide_aspect_ratio
double tosp_wide_aspect_ratio
Definition: textord.h:325
TO_ROW::xheight
float xheight
Definition: blobbox.h:656
tesseract
Definition: baseapi.h:65
STATS::median
double median() const
Definition: statistc.cpp:218
tesseract::Textord::tosp_force_wordbreak_on_punct
bool tosp_force_wordbreak_on_punct
Definition: textord.h:271
tesseract::Textord::tosp_use_pre_chopping
bool tosp_use_pre_chopping
Definition: textord.h:273
tesseract::Textord::tosp_init_guess_kn_mult
double tosp_init_guess_kn_mult
Definition: textord.h:355
TBOX::plot
void plot(ScrollView *fd) const
Definition: rect.h:285
ScrollView::RED
Definition: scrollview.h:104
STATS
Definition: statistc.h:30
BLOBNBOX::bounding_box
const TBOX & bounding_box() const
Definition: blobbox.h:229
tesseract::Textord::tosp_few_samples
int tosp_few_samples
Definition: textord.h:308
tesseract::Textord::tosp_flip_fuzz_sp_to_kn
bool tosp_flip_fuzz_sp_to_kn
Definition: textord.h:299
drawtord.h
BLOBNBOX::set_reduced_box
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:248
tesseract::Textord::make_prop_words
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:885
tesseract::Textord::tosp_max_sane_kn_thresh
double tosp_max_sane_kn_thresh
Definition: textord.h:359
TO_BLOCK::get_rows
TO_ROW_LIST * get_rows()
Definition: blobbox.h:703
box_next_pre_chopped
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:657
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
tesseract::Textord::tosp_use_xht_gaps
bool tosp_use_xht_gaps
Definition: textord.h:293
TO_ROW::space_threshold
int32_t space_threshold
Definition: blobbox.h:664
tesseract::Textord::tosp_ignore_very_big_gaps
double tosp_ignore_very_big_gaps
Definition: textord.h:340
TO_ROW::max_nonspace
int32_t max_nonspace
Definition: blobbox.h:663
QSPLINE::y
double y(double x) const
Definition: quspline.cpp:202
box_next
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:629
tesseract::Textord::tosp_fuzzy_space_factor2
double tosp_fuzzy_space_factor2
Definition: textord.h:331
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
tesseract::Textord::tosp_rep_space
double tosp_rep_space
Definition: textord.h:341
tesseract::Textord::tosp_block_use_cert_spaces
bool tosp_block_use_cert_spaces
Definition: textord.h:277
WERD
Definition: werd.h:55
tesseract::Textord::tosp_kern_gap_factor1
double tosp_kern_gap_factor1
Definition: textord.h:334
TBOX::left
int16_t left() const
Definition: rect.h:71
ROW
Definition: ocrrow.h:35
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
ScrollView::GREEN
Definition: scrollview.h:106
tesseract::Textord::tosp_flip_caution
double tosp_flip_caution
Definition: textord.h:361
tesseract::Textord::tosp_enough_space_samples_for_median
int tosp_enough_space_samples_for_median
Definition: textord.h:304
TBOX::right
int16_t right() const
Definition: rect.h:78
tesseract::Textord::tosp_narrow_aspect_ratio
double tosp_narrow_aspect_ratio
Definition: textord.h:322
tesseract::Textord::tosp_old_to_method
bool tosp_old_to_method
Definition: textord.h:263
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD::set_blanks
void set_blanks(uint8_t new_blanks)
Definition: werd.h:101
tesseract::Textord::tosp_fuzzy_kn_fraction
double tosp_fuzzy_kn_fraction
Definition: textord.h:350
TO_ROW
Definition: blobbox.h:543
TO_ROW::kern_size
float kern_size
Definition: blobbox.h:665
gapmap_big_gaps
double gapmap_big_gaps
Definition: gap_map.cpp:18
ScrollView::Color
Color
Definition: scrollview.h:100
BLOBNBOX::cblob
C_BLOB * cblob() const
Definition: blobbox.h:267
TO_ROW::baseline
QSPLINE baseline
Definition: blobbox.h:669
tesseract::Textord::tosp_gap_factor
double tosp_gap_factor
Definition: textord.h:332
ScrollView::Ellipse
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:608
BLOBNBOX::reduced_box
const TBOX & reduced_box() const
Definition: blobbox.h:245
ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:54
tesseract::Textord::tosp_silly_kn_sp_gap
double tosp_silly_kn_sp_gap
Definition: textord.h:369
tesseract::Textord::tosp_init_guess_xht_mult
double tosp_init_guess_xht_mult
Definition: textord.h:357
to_win
ScrollView * to_win
Definition: drawtord.cpp:34
find_cblob_hlimits
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:571
GAPMAP::table_gap
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:159
tesseract::Textord::tosp_threshold_bias1
double tosp_threshold_bias1
Definition: textord.h:316
W_BOL
start of line
Definition: werd.h:46
tesseract::Textord::tosp_kern_gap_factor3
double tosp_kern_gap_factor3
Definition: textord.h:338
TO_ROW::blob_list
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:599
TBOX
Definition: rect.h:33