tesseract  4.0.0-1-g2a2b
tordmain.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tordmain.cpp (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author: Ray Smith
5  * Created: Tue Jul 28 17:12:33 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "tordmain.h"
25 #include <cfloat> // for FLT_MAX
26 #include <cmath> // for ceil, floor, M_PI
27 #include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t
28 #include "allheaders.h" // for pixDestroy, pixGetHeight, boxCreate
29 #include "arrayaccess.h" // for GET_DATA_BYTE
30 #include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B...
31 #include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction
32 #include "clst.h" // for CLISTIZE
33 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE
34 #include "drawtord.h" // for plot_box_list, to_win, create_to_win
35 #include "edgblob.h" // for extract_edges
36 #include "errcode.h" // for set_global_loc_code, ASSERT_HOST, LOC...
37 #include "genericvector.h" // for PointerVector, GenericVector
38 #include "makerow.h" // for textord_test_x, textord_test_y, texto...
39 #include "morph.h" // for L_BOUNDARY_BG
40 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
41 #include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base...
42 #include "params.h" // for DoubleParam, BoolParam, IntParam
43 #include "pdblock.h" // for PDBLK
44 #include "points.h" // for FCOORD, ICOORD
45 #include "polyblk.h" // for POLY_BLOCK
46 #include "quadratc.h" // for QUAD_COEFFS
47 #include "quspline.h" // for QSPLINE, tweak_row_baseline
48 #include "rect.h" // for TBOX
49 #include "scrollview.h" // for ScrollView, ScrollView::WHITE
50 #include "statistc.h" // for STATS
51 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
52 #include "textord.h" // for Textord, WordWithBox, WordGrid, WordS...
53 #include "tprintf.h" // for tprintf
54 #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP
55 
56 struct Box;
57 
// Upper limit passed to the STATS histogram constructor; in this file it is
// used by filter_noise_blobs for its histogram of blob heights.
58 #define MAX_NEAREST_DIST 600 //for block skew stats
59 
60 namespace tesseract {
61 
62 CLISTIZE(WordWithBox)
63 
64 /**********************************************************************
65  * SetBlobStrokeWidth
66  *
67  * Set the horizontal and vertical stroke widths in the blob.
68  **********************************************************************/
69 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
70  // Cut the blob rectangle into a Pix.
71  int pix_height = pixGetHeight(pix);
72  const TBOX& box = blob->bounding_box();
73  int width = box.width();
74  int height = box.height();
75  Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
76  width, height);
77  Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr);
78  boxDestroy(&blob_pix_box);
79  Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
80  pixDestroy(&pix_blob);
81  // Compute the stroke widths.
82  uint32_t* data = pixGetData(dist_pix);
83  int wpl = pixGetWpl(dist_pix);
84  // Horizontal width of stroke.
85  STATS h_stats(0, width + 1);
86  for (int y = 0; y < height; ++y) {
87  uint32_t* pixels = data + y*wpl;
88  int prev_pixel = 0;
89  int pixel = GET_DATA_BYTE(pixels, 0);
90  for (int x = 1; x < width; ++x) {
91  int next_pixel = GET_DATA_BYTE(pixels, x);
92  // We are looking for a pixel that is equal to its vertical neighbours,
93  // yet greater than its left neighbour.
94  if (prev_pixel < pixel &&
95  (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
96  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
97  if (pixel > next_pixel) {
98  // Single local max, so an odd width.
99  h_stats.add(pixel * 2 - 1, 1);
100  } else if (pixel == next_pixel && x + 1 < width &&
101  pixel > GET_DATA_BYTE(pixels, x + 1)) {
102  // Double local max, so an even width.
103  h_stats.add(pixel * 2, 1);
104  }
105  }
106  prev_pixel = pixel;
107  pixel = next_pixel;
108  }
109  }
110  // Vertical width of stroke.
111  STATS v_stats(0, height + 1);
112  for (int x = 0; x < width; ++x) {
113  int prev_pixel = 0;
114  int pixel = GET_DATA_BYTE(data, x);
115  for (int y = 1; y < height; ++y) {
116  uint32_t* pixels = data + y*wpl;
117  int next_pixel = GET_DATA_BYTE(pixels, x);
118  // We are looking for a pixel that is equal to its horizontal neighbours,
119  // yet greater than its upper neighbour.
120  if (prev_pixel < pixel &&
121  (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
122  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
123  if (pixel > next_pixel) {
124  // Single local max, so an odd width.
125  v_stats.add(pixel * 2 - 1, 1);
126  } else if (pixel == next_pixel && y + 1 < height &&
127  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
128  // Double local max, so an even width.
129  v_stats.add(pixel * 2, 1);
130  }
131  }
132  prev_pixel = pixel;
133  pixel = next_pixel;
134  }
135  }
136  pixDestroy(&dist_pix);
137  // Store the horizontal and vertical width in the blob, keeping both
138  // widths if there is enough information, otherwse only the one with
139  // the most samples.
140  // If there are insufficient samples, store zero, rather than using
141  // 2*area/perimeter, as the numbers that gives do not match the numbers
142  // from the distance method.
143  if (h_stats.get_total() >= (width + height) / 4) {
144  blob->set_horz_stroke_width(h_stats.ile(0.5f));
145  if (v_stats.get_total() >= (width + height) / 4)
146  blob->set_vert_stroke_width(v_stats.ile(0.5f));
147  else
148  blob->set_vert_stroke_width(0.0f);
149  } else {
150  if (v_stats.get_total() >= (width + height) / 4 ||
151  v_stats.get_total() > h_stats.get_total()) {
152  blob->set_horz_stroke_width(0.0f);
153  blob->set_vert_stroke_width(v_stats.ile(0.5f));
154  } else {
155  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
156  : 0.0f);
157  blob->set_vert_stroke_width(0.0f);
158  }
159  }
160 }
161 
162 /**********************************************************************
163  * assign_blobs_to_blocks2
164  *
165  * Make a list of TO_BLOCKs for portrait and landscape orientation.
166  **********************************************************************/
167 
169  BLOCK_LIST *blocks, // blocks to process
170  TO_BLOCK_LIST *port_blocks) { // output list
// NOTE(review): the first line of this signature (listing line 168) is absent
// from this listing. The body passes `pix` to SetBlobStrokeWidth, so the
// function presumably takes the page image as a leading parameter -- confirm
// against the original source.
//
// For every BLOCK in `blocks` a new TO_BLOCK is created and appended to
// `port_blocks`. The block's blobs and its rejected blobs are extracted from
// the BLOCK, wrapped in BLOBNBOXes, given stroke widths, and stored in the
// TO_BLOCK's `blobs` and `noise_blobs` lists respectively.
171  BLOCK *block; // current block
172  BLOBNBOX *newblob; // created blob
173  C_BLOB *blob; // current blob
174  BLOCK_IT block_it = blocks;
175  C_BLOB_IT blob_it; // iterator
176  BLOBNBOX_IT port_box_it; // iterator
177  // destination iterator
178  TO_BLOCK_IT port_block_it = port_blocks;
179  TO_BLOCK *port_block; // created block
180 
181  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
182  block = block_it.data();
183  port_block = new TO_BLOCK(block);
184 
185  // Convert the good outlines to block->blob_list
186  port_box_it.set_to_list(&port_block->blobs);
187  blob_it.set_to_list(block->blob_list());
188  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// extract() removes the C_BLOB from the source list before it is wrapped.
189  blob = blob_it.extract();
190  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
191  SetBlobStrokeWidth(pix, newblob);
192  port_box_it.add_after_then_move(newblob);
193  }
194 
195  // Put the rejected outlines in block->noise_blobs, which allows them to
196  // be reconsidered and sorted back into rows and recover outlines mistakenly
197  // rejected.
198  port_box_it.set_to_list(&port_block->noise_blobs);
199  blob_it.set_to_list(block->reject_blobs());
200  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
201  blob = blob_it.extract();
202  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
203  SetBlobStrokeWidth(pix, newblob);
204  port_box_it.add_after_then_move(newblob);
205  }
206 
207  port_block_it.add_after_then_move(port_block);
208  }
209 }
210 
211 /**********************************************************************
212  * find_components
213  *
214  * Find the C_OUTLINEs of the connected components in each block, put them
215  * in C_BLOBs, and filter them by size, putting the different size
216  * grades on different lists in the matching TO_BLOCK in to_blocks.
217  **********************************************************************/
218 
// Entry point for component finding on one page image.
//   pix:       page image.
//   blocks:    page blocks; edges are extracted only for blocks that have no
//              poly_block or whose poly_block is text.
//   to_blocks: output list, filled by assign_blobs_to_blocks2 and then
//              graded by filter_blobs.
// Returns early (after printing a message) if either image dimension exceeds
// INT16_MAX.
// NOTE(review): listing line 228 is absent from this listing; a statement may
// be missing between the size check and the block loop.
219 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
220  TO_BLOCK_LIST *to_blocks) {
221  int width = pixGetWidth(pix);
222  int height = pixGetHeight(pix);
223  if (width > INT16_MAX || height > INT16_MAX) {
224  tprintf("Input image too large! (%d, %d)\n", width, height);
225  return; // Can't handle it.
226  }
227 
229 
230  BLOCK_IT block_it(blocks); // iterator
231  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
232  block_it.forward()) {
233  BLOCK* block = block_it.data();
// Blocks with a non-text poly_block are skipped here.
234  if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {
235  extract_edges(pix, block);
236  }
237  }
238 
239  assign_blobs_to_blocks2(pix, blocks, to_blocks);
// The page's top-right corner is handed to filter_blobs, which forwards it
// to create_to_win when plotting.
240  ICOORD page_tr(width, height);
241  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
242 }
243 
244 /**********************************************************************
245  * filter_blobs
246  *
247  * Sort the blobs into sizes in all the blocks for later work.
248  **********************************************************************/
249 
// Grades the blobs of every TO_BLOCK in `blocks` with filter_noise_blobs and
// stores the returned size estimate in block->line_size (forced to 1 if it
// comes back as 0). When graphics are enabled and `testing_on` is set, the
// debug window is created on demand (using `page_tr`) and the blobs plotted.
// NOTE(review): listing lines 270-275 and 286-289 are absent from this
// listing, so the line_spacing expression and the body of the
// textord_show_boxes branch are incomplete here.
250 void Textord::filter_blobs(ICOORD page_tr, // top right
251  TO_BLOCK_LIST* blocks, // output list
252  bool testing_on) { // for plotting
253  TO_BLOCK_IT block_it = blocks; // destination iterator
254  TO_BLOCK *block; // created block
255 
256  #ifndef GRAPHICS_DISABLED
257  if (to_win != nullptr)
258  to_win->Clear();
259  #endif // GRAPHICS_DISABLED
260 
261  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
262  block_it.forward()) {
263  block = block_it.data();
264  block->line_size = filter_noise_blobs(&block->blobs,
265  &block->noise_blobs,
266  &block->small_blobs,
267  &block->large_blobs);
// Avoid a zero line size in the computations that follow.
268  if (block->line_size == 0) block->line_size = 1;
269  block->line_spacing = block->line_size *
276 
277  #ifndef GRAPHICS_DISABLED
278  if (textord_show_blobs && testing_on) {
279  if (to_win == nullptr)
280  create_to_win(page_tr);
281  block->plot_graded_blobs(to_win);
282  }
283  if (textord_show_boxes && testing_on) {
284  if (to_win == nullptr)
285  create_to_win(page_tr);
290  }
291  #endif // GRAPHICS_DISABLED
292  }
293 }
294 
295 /**********************************************************************
296  * filter_noise_blobs
297  *
298  * Move small blobs to a separate list.
299  **********************************************************************/
300 
// Splits the blobs of `src_list` by size and returns a size estimate.
//   src_list:   in/out; on return holds the blobs kept as normal-sized.
//   noise_list: receives blobs judged to be noise in the first pass.
//   small_list: receives blobs that are too small.
//   large_list: receives blobs that are too tall or too wide.
// Returns the larger of two height percentiles of the kept blobs: the
// textord_initialx_ile percentile taken after the first pass and the
// textord_initialasc_ile percentile taken after the final pass.
// NOTE(review): listing lines 323, 326, 334-337 and 363 are absent from this
// listing, so the first-pass conditions and the max_y expression are
// incomplete here.
301 float Textord::filter_noise_blobs(
302  BLOBNBOX_LIST *src_list, // original list
303  BLOBNBOX_LIST *noise_list, // noise list
304  BLOBNBOX_LIST *small_list, // small blobs
305  BLOBNBOX_LIST *large_list) { // large blobs
306  int16_t height; //height of blob
307  int16_t width; //of blob
308  BLOBNBOX *blob; //current blob
309  float initial_x; //first guess
310  BLOBNBOX_IT src_it = src_list; //iterators
311  BLOBNBOX_IT noise_it = noise_list;
312  BLOBNBOX_IT small_it = small_list;
313  BLOBNBOX_IT large_it = large_list;
314  STATS size_stats (0, MAX_NEAREST_DIST);
315  //blob heights
316  float min_y; //size limits
317  float max_y;
318  float max_x;
319  float max_height; //of good blobs
320 
// First pass: move blobs out of src_list into the noise or small list.
321  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
322  blob = src_it.data();
324  noise_it.add_after_then_move(src_it.extract());
325  else if (blob->enclosed_area() >= blob->bounding_box().height()
327  small_it.add_after_then_move(src_it.extract());
328  }
// Histogram the heights of what is left to get a first size guess.
329  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
330  size_stats.add(src_it.data()->bounding_box().height(), 1);
331  }
332  initial_x = size_stats.ile(textord_initialx_ile);
333  max_y = ceil(initial_x *
338  min_y = floor (initial_x / 2);
339  max_x = ceil (initial_x * textord_width_limit);
// Re-examine the small list against the limits: too-tall blobs go to the
// large list, blobs of at least min_y go back into src_list.
340  small_it.move_to_first ();
341  for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
342  small_it.forward ()) {
343  height = small_it.data()->bounding_box().height();
344  if (height > max_y)
345  large_it.add_after_then_move(small_it.extract ());
346  else if (height >= min_y)
347  src_it.add_after_then_move(small_it.extract ());
348  }
// Final pass over src_list: push out anything outside the limits and
// histogram the heights of the blobs that stay.
349  size_stats.clear ();
350  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
351  height = src_it.data ()->bounding_box ().height ();
352  width = src_it.data ()->bounding_box ().width ();
353  if (height < min_y)
354  small_it.add_after_then_move (src_it.extract ());
355  else if (height > max_y || width > max_x)
356  large_it.add_after_then_move (src_it.extract ());
357  else
358  size_stats.add (height, 1);
359  }
360  max_height = size_stats.ile (textord_initialasc_ile);
361  // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
362  // max_y,min_y,initial_x,max_height);
// Return whichever estimate is larger.
364  if (max_height > initial_x)
365  initial_x = max_height;
366  // tprintf(" ret=%g\n",initial_x);
367  return initial_x;
368 }
369 
370 // Fixes the block so it obeys all the rules:
371 // Must have at least one ROW.
372 // Must have at least one WERD.
373 // WERDs contain a fake blob.
374 void Textord::cleanup_nontext_block(BLOCK* block) {
375  // Non-text blocks must contain at least one row.
376  ROW_IT row_it(block->row_list());
377  if (row_it.empty()) {
378  const TBOX& box = block->pdblk.bounding_box();
379  float height = box.height();
380  int32_t xstarts[2] = {box.left(), box.right()};
381  double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
382  ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
383  height / 4.0f, 0, 1);
384  row_it.add_after_then_move(row);
385  }
386  // Each row must contain at least one word.
387  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
388  ROW* row = row_it.data();
389  WERD_IT w_it(row->word_list());
390  if (w_it.empty()) {
391  // Make a fake blob to put in the word.
392  TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
393  : row->bounding_box();
394  C_BLOB* blob = C_BLOB::FakeBlob(box);
395  C_BLOB_LIST blobs;
396  C_BLOB_IT blob_it(&blobs);
397  blob_it.add_after_then_move(blob);
398  WERD* word = new WERD(&blobs, 0, nullptr);
399  w_it.add_after_then_move(word);
400  }
401  // Each word must contain a fake blob.
402  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
403  WERD* word = w_it.data();
404  // Just assert that this is true, as it would be useful to find
405  // out why it isn't.
406  ASSERT_HOST(!word->cblob_list()->empty());
407  }
408  row->recalc_bounding_box();
409  }
410 }
411 
412 /**********************************************************************
413  * cleanup_blocks
414  *
415  * Delete empty blocks, rows from the page.
416  **********************************************************************/
417 
// Walks all blocks of the page and removes empty rows and empty text blocks.
//   clean_noise: when true, each row of a text block is noise-cleaned first
//                and rows that end up empty (or are judged to be noise by
//                clean_noise_from_row) are deleted.
//   blocks:      page blocks, modified in place.
// Blocks whose poly_block is non-text are only passed to
// cleanup_nontext_block and never deleted here.
// NOTE(review): listing lines 446, 449-450, 461 and 464 are absent from this
// listing; the statement guarded by the textord_blshift_maxshift test and the
// conditions around the two tprintf calls are therefore not visible.
418 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
419  BLOCK_IT block_it = blocks; //iterator
420  ROW_IT row_it; //row iterator
421 
// The *_all counters count everything visited; the others count survivors.
422  int num_rows = 0;
423  int num_rows_all = 0;
424  int num_blocks = 0;
425  int num_blocks_all = 0;
426  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
427  block_it.forward()) {
428  BLOCK* block = block_it.data();
429  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
430  cleanup_nontext_block(block);
431  continue;
432  }
// Row counters are per block.
433  num_rows = 0;
434  num_rows_all = 0;
435  if (clean_noise) {
436  row_it.set_to_list(block->row_list());
437  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
438  ROW* row = row_it.data();
439  ++num_rows_all;
440  clean_small_noise_from_words(row);
// Delete the row if it is empty, or if row rejection is enabled and the
// row as a whole is judged to be noise.
441  if ((textord_noise_rejrows && !row->word_list()->empty() &&
442  clean_noise_from_row(row)) ||
443  row->word_list()->empty()) {
444  delete row_it.extract(); // lose empty row.
445  } else {
447  clean_noise_from_words(row_it.data());
448  if (textord_blshift_maxshift >= 0)
451  ++num_rows;
452  }
453  }
454  }
455  if (block->row_list()->empty()) {
456  delete block_it.extract(); // Lose empty text blocks.
457  } else {
458  ++num_blocks;
459  }
460  ++num_blocks_all;
462  tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
463  }
465  tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
466 }
467 
468 
469 /**********************************************************************
470  * clean_noise_from_row
471  *
472  * Decide whether a row is garbage: returns true when its noise-like blobs outnumber its normal ones.
473  **********************************************************************/
474 
// Decides whether `row` as a whole looks like noise. Nothing is modified:
// the function only counts blobs/outlines and returns a verdict.
//   dot_count:        outlines smaller than textord_noise_sizelimit * x-height
//                     (+1 each) and over-tall blobs that are not the first
//                     blob of the first word (+2 each).
//   norm_count:       mid-sized blobs with fewer than textord_noise_translimit
//                     transitions.
//   super_norm_count: outlines with children whose width and height are both
//                     close to the x-height, plus every blob of a
//                     W_DONT_CHOP word.
// Returns true when super_norm_count < textord_noise_sncount, dot_count > 2
// and dot_count > norm_count * textord_noise_rowratio.
// NOTE(review): listing line 495 is absent from this listing, so the
// testing_on expression below may be missing a term.
475 bool Textord::clean_noise_from_row( //remove empties
476  ROW* row //row to clean
477 ) {
478  bool testing_on;
479  TBOX blob_box; //bounding box
480  C_BLOB *blob; //current blob
481  C_OUTLINE *outline; //current outline
482  WERD *word; //current word
483  int32_t blob_size; //biggest size
484  int32_t trans_count = 0; //no of transitions
485  int32_t trans_threshold; //noise tolerance
486  int32_t dot_count; //small objects
487  int32_t norm_count; //normal objects
488  int32_t super_norm_count; //real char-like
489  //words of row
490  WERD_IT word_it = row->word_list ();
491  C_BLOB_IT blob_it; //blob iterator
492  C_OUTLINE_IT out_it; //outline iterator
493 
// Per-blob debug output is enabled when the test point lies between the
// row's baseline and baseline + x-height.
494  testing_on = textord_test_y > row->base_line (textord_test_x)
496  && textord_test_y < row->base_line (textord_test_x) + row->x_height ();
497  dot_count = 0;
498  norm_count = 0;
499  super_norm_count = 0;
500  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
501  word = word_it.data (); //current word
502  //blobs in word
503  blob_it.set_to_list (word->cblob_list ());
504  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
505  blob_it.forward ()) {
506  blob = blob_it.data ();
507  if (!word->flag (W_DONT_CHOP)) {
508  //get outlines
509  out_it.set_to_list (blob->out_list ());
510  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
511  out_it.forward ()) {
512  outline = out_it.data ();
513  blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
514  blob_size =
515  blob_box.width () >
516  blob_box.height ()? blob_box.width () : blob_box.
517  height();
518  if (blob_size < textord_noise_sizelimit * row->x_height ())
519  dot_count++; //count small outlines
520  if (!outline->child ()->empty ()
521  && blob_box.height () <
522  (1 + textord_noise_syfract) * row->x_height ()
523  && blob_box.height () >
524  (1 - textord_noise_syfract) * row->x_height ()
525  && blob_box.width () <
526  (1 + textord_noise_sxfract) * row->x_height ()
527  && blob_box.width () >
528  (1 - textord_noise_sxfract) * row->x_height ())
529  super_norm_count++; //count x-height-sized outlines that have children
530  }
531  }
532  else
533  super_norm_count++;
534  blob_box = blob->bounding_box ();
535  blob_size =
536  blob_box.width () >
537  blob_box.height ()? blob_box.width () : blob_box.height ();
538  if (blob_size >= textord_noise_sizelimit * row->x_height ()
539  && blob_size < row->x_height () * 2) {
540  trans_threshold = blob_size / textord_noise_sizefraction;
541  trans_count = blob->count_transitions (trans_threshold);
542  if (trans_count < textord_noise_translimit)
543  norm_count++;
544  }
// Over-tall blobs count double, except the very first blob of the row.
545  else if (blob_box.height () > row->x_height () * 2
546  && (!word_it.at_first () || !blob_it.at_first ()))
547  dot_count += 2;
548  if (testing_on) {
549  tprintf
550  ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
551  blob_box.left (), blob_box.bottom (), blob_box.right (),
552  blob_box.top (), blob->out_list ()->length (), trans_count,
553  blob_box.bottom () - row->base_line (blob_box.left ()));
554  }
555  }
556  }
// NOTE(review): the ACCEPTED/REJECTED label printed here is computed with
// textord_noise_normratio and ignores super_norm_count, whereas the value
// actually returned uses textord_noise_rowratio and textord_noise_sncount,
// so the printed verdict can disagree with the return value.
557  if (textord_noise_debug) {
558  tprintf ("Row ending at (%d,%g):",
559  blob_box.right (), row->base_line (blob_box.right ()));
560  tprintf (" R=%g, dc=%d, nc=%d, %s\n",
561  norm_count > 0 ? (float) dot_count / norm_count : 9999,
562  dot_count, norm_count,
563  dot_count > norm_count * textord_noise_normratio
564  && dot_count > 2 ? "REJECTED" : "ACCEPTED");
565  }
566  return super_norm_count < textord_noise_sncount
567  && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
568 }
569 
570 /**********************************************************************
571  * clean_noise_from_words
572  *
573  * Move blobs of words from rows of garbage into the reject blobs list.
574  **********************************************************************/
575 
// Grades every word of `row` as noise or not, then revisits the words that
// were graded as noise.
// Does nothing if the row has no words or textord_no_rejects is set.
// Pass 1 stores a grade per word in word_dud:
//   2 = dot_count > 2 and dot_count > 2 * textord_noise_normratio * norm_count
//   1 = dot_count > 2 and dot_count >     textord_noise_normratio * norm_count
//   0 = everything else, including all W_REP_CHAR words.
// Pass 2 selects words graded 2, and words graded 1 when grade-2 words
// outnumber all the others.
// NOTE(review): listing line 680 is absent from this listing, so the action
// taken on a selected word in pass 2 is not visible here.
576 void Textord::clean_noise_from_words( //remove empties
577  ROW *row //row to clean
578  ) {
579  TBOX blob_box; //bounding box
580  C_BLOB *blob; //current blob
581  C_OUTLINE *outline; //current outline
582  WERD *word; //current word
583  int32_t blob_size; //biggest size
584  int32_t trans_count; //no of transitions
585  int32_t trans_threshold; //noise tolerance
586  int32_t dot_count; //small objects
587  int32_t norm_count; //normal objects
588  int32_t dud_words; //number discarded
589  int32_t ok_words; //number remaining
590  int32_t word_index; //current word
591  //words of row
592  WERD_IT word_it = row->word_list ();
593  C_BLOB_IT blob_it; //blob iterator
594  C_OUTLINE_IT out_it; //outline iterator
595 
// ok_words temporarily holds the word count, used to size word_dud.
596  ok_words = word_it.length ();
597  if (ok_words == 0 || textord_no_rejects)
598  return;
599  // was it chucked
600  std::vector<int8_t> word_dud(ok_words);
601  dud_words = 0;
602  ok_words = 0;
603  word_index = 0;
604  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
605  word = word_it.data (); //current word
606  dot_count = 0;
607  norm_count = 0;
608  //blobs in word
609  blob_it.set_to_list (word->cblob_list ());
610  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
611  blob_it.forward ()) {
612  blob = blob_it.data ();
613  if (!word->flag (W_DONT_CHOP)) {
614  //get outlines
615  out_it.set_to_list (blob->out_list ());
616  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
617  out_it.forward ()) {
618  outline = out_it.data ();
619  blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
620  blob_size =
621  blob_box.width () >
622  blob_box.height ()? blob_box.width () : blob_box.
623  height();
624  if (blob_size < textord_noise_sizelimit * row->x_height ())
625  dot_count++; //count small outlines
626  if (!outline->child ()->empty ()
627  && blob_box.height () <
628  (1 + textord_noise_syfract) * row->x_height ()
629  && blob_box.height () >
630  (1 - textord_noise_syfract) * row->x_height ()
631  && blob_box.width () <
632  (1 + textord_noise_sxfract) * row->x_height ()
633  && blob_box.width () >
634  (1 - textord_noise_sxfract) * row->x_height ())
635  norm_count++; //count x-height-sized outlines that have children
636  }
637  }
638  else
639  norm_count++;
640  blob_box = blob->bounding_box ();
641  blob_size =
642  blob_box.width () >
643  blob_box.height ()? blob_box.width () : blob_box.height ();
644  if (blob_size >= textord_noise_sizelimit * row->x_height ()
645  && blob_size < row->x_height () * 2) {
646  trans_threshold = blob_size / textord_noise_sizefraction;
647  trans_count = blob->count_transitions (trans_threshold);
648  if (trans_count < textord_noise_translimit)
649  norm_count++;
650  }
// Over-tall blobs count double, except the very first blob of the row.
651  else if (blob_box.height () > row->x_height () * 2
652  && (!word_it.at_first () || !blob_it.at_first ()))
653  dot_count += 2;
654  }
655  if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
656  if (dot_count > norm_count * textord_noise_normratio * 2)
657  word_dud[word_index] = 2;
658  else if (dot_count > norm_count * textord_noise_normratio)
659  word_dud[word_index] = 1;
660  else
661  word_dud[word_index] = 0;
662  } else {
663  word_dud[word_index] = 0;
664  }
// Only grade 2 counts as a dud; grades 0 and 1 both count as ok.
665  if (word_dud[word_index] == 2)
666  dud_words++;
667  else
668  ok_words++;
669  word_index++;
670  }
671 
672  word_index = 0;
673  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
674  if (word_dud[word_index] == 2
675  || (word_dud[word_index] == 1 && dud_words > ok_words)) {
676  word = word_it.data(); // Current word.
677  // Previously we threw away the entire word.
678  // Now just aggressively throw all small blobs into the reject list, where
679  // the classifier can decide whether they are actually needed.
681  }
682  word_index++;
683  }
684 }
685 
686 // Remove outlines that are a tiny fraction in either width or height
687 // of the word height.
688 void Textord::clean_small_noise_from_words(ROW *row) {
689  WERD_IT word_it(row->word_list());
690  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
691  WERD* word = word_it.data();
692  int min_size = static_cast<int>(
693  textord_noise_hfract * word->bounding_box().height() + 0.5);
694  C_BLOB_IT blob_it(word->cblob_list());
695  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
696  C_BLOB* blob = blob_it.data();
697  C_OUTLINE_IT out_it(blob->out_list());
698  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
699  C_OUTLINE* outline = out_it.data();
700  outline->RemoveSmallRecursive(min_size, &out_it);
701  }
702  if (blob->out_list()->empty()) {
703  delete blob_it.extract();
704  }
705  }
706  if (word->cblob_list()->empty()) {
707  if (!word_it.at_last()) {
708  // The next word is no longer a fuzzy non space if it was before,
709  // since the word before is about to be deleted.
710  WERD* next_word = word_it.data_relative(1);
711  if (next_word->flag(W_FUZZY_NON)) {
712  next_word->set_flag(W_FUZZY_NON, false);
713  }
714  }
715  delete word_it.extract();
716  }
717  }
718 }
719 
720 // Local struct to hold a group of blocks.
// Aggregates blocks that share (approximately) the same rotation, together
// with the union of their bounding boxes and their smallest x-height.
// NOTE(review): listing lines 731, 733 and 739 are absent from this listing.
// The constructors and TransferDiacriticsToBlockGroups use members named
// bounding_box, rotation and blocks, whose declarations presumably sit on
// those lines -- confirm against the original source.
721 struct BlockGroup {
// Default group: rotation (1, 0), angle 0 and min x-height 1.
722  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
// Starts a group from a single block, taking its box, re_rotation and x-height.
723  explicit BlockGroup(BLOCK* block)
724  : bounding_box(block->pdblk.bounding_box()),
725  rotation(block->re_rotation()),
726  angle(block->re_rotation().angle()),
727  min_xheight(block->x_height()) {
728  blocks.push_back(block);
729  }
730  // Union of block bounding boxes.
732  // Common rotation of the blocks.
734  // Angle of rotation.
735  float angle;
736  // Min xheight of the blocks.
737  float min_xheight;
738  // Collection of borrowed pointers to the blocks in the group.
740 };
741 
742 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
743 // TransferDiacriticsToWords to copy the diacritic blobs to the most
744 // appropriate words in the group of blocks. Source blobs are not touched.
// Groups the text blocks of `blocks` by re_rotation angle (blocks whose
// angles differ by at most kMaxAngleDiff share a group). For each group it
// builds a WordGrid of all words in the group's blocks and calls
// TransferDiacriticsToWords with the group's rotation, y component negated.
//   diacritic_blobs: blobs to distribute; only read here.
//   blocks:          page blocks; those with a non-text poly_block are skipped.
// NOTE(review): listing line 750 is absent from this listing; the declaration
// of `groups` presumably sits there -- confirm against the original source.
745 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
746  BLOCK_LIST* blocks) {
747  // Angle difference larger than this is too much to consider equal.
748  // They should only be in multiples of M_PI/2 anyway.
749  const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
751  BLOCK_IT bk_it(blocks);
752  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
753  BLOCK* block = bk_it.data();
754  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
755  continue;
756  }
757  // Linear search of the groups to find a matching rotation.
758  float block_angle = block->re_rotation().angle();
759  int best_g = 0;
760  float best_angle_diff = FLT_MAX;
761  for (int g = 0; g < groups.size(); ++g) {
762  double angle_diff = fabs(block_angle - groups[g]->angle);
// Differences above pi are measured the other way round the circle.
763  if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
764  if (angle_diff < best_angle_diff) {
765  best_angle_diff = angle_diff;
766  best_g = g;
767  }
768  }
// No group is close enough (or none exists yet): start a new one.
// Otherwise merge the block into the closest group.
769  if (best_angle_diff > kMaxAngleDiff) {
770  groups.push_back(new BlockGroup(block));
771  } else {
772  groups[best_g]->blocks.push_back(block);
773  groups[best_g]->bounding_box += block->pdblk.bounding_box();
774  float x_height = block->x_height();
775  if (x_height < groups[best_g]->min_xheight)
776  groups[best_g]->min_xheight = x_height;
777  }
778  }
779  // Now process each group of blocks.
780  PointerVector<WordWithBox> word_ptrs;
781  for (int g = 0; g < groups.size(); ++g) {
782  const BlockGroup* group = groups[g];
783  if (group->bounding_box.null_box()) continue;
784  WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
785  group->bounding_box.topright());
786  for (int b = 0; b < group->blocks.size(); ++b) {
787  ROW_IT row_it(group->blocks[b]->row_list());
788  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
789  ROW* row = row_it.data();
790  // Put the words of the row into the grid.
791  WERD_IT w_it(row->word_list());
792  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
793  WERD* word = w_it.data();
794  WordWithBox* box_word = new WordWithBox(word);
795  word_grid.InsertBBox(true, true, box_word);
796  // Save the pointer where it will be auto-deleted.
797  word_ptrs.push_back(box_word);
798  }
799  }
800  }
801  FCOORD rotation = group->rotation;
802  // Make it a forward rotation that will transform blob coords to block.
803  rotation.set_y(-rotation.y());
804  TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
805  }
806 }
807 
808 // Places a copy of blobs that are near a word (after applying rotation to the
809 // blob) in the most appropriate word, unless there is doubt, in which case a
810 // blob can end up in two words. Source blobs are not touched.
811 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
812  const FCOORD& rotation,
813  WordGrid* word_grid) {
814  WordSearch ws(word_grid);
815  BLOBNBOX_IT b_it(diacritic_blobs);
816  // Apply rotation to each blob before finding the nearest words. The rotation
817  // allows us to only consider above/below placement and not left/right on
818  // vertical text, because all text is horizontal here.
819  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
820  BLOBNBOX* blobnbox = b_it.data();
821  TBOX blob_box = blobnbox->bounding_box();
822  blob_box.rotate(rotation);
823  ws.StartRectSearch(blob_box);
824  // Above/below refer to word position relative to diacritic. Since some
825  // scripts eg Kannada/Telugu habitually put diacritics below words, and
826  // others eg Thai/Vietnamese/Latin put most diacritics above words, try
827  // for both if there isn't much in it.
828  WordWithBox* best_above_word = nullptr;
829  WordWithBox* best_below_word = nullptr;
830  int best_above_distance = 0;
831  int best_below_distance = 0;
832  for (WordWithBox* word = ws.NextRectSearch(); word != nullptr;
833  word = ws.NextRectSearch()) {
834  if (word->word()->flag(W_REP_CHAR)) continue;
835  TBOX word_box = word->true_bounding_box();
836  int x_distance = blob_box.x_gap(word_box);
837  int y_distance = blob_box.y_gap(word_box);
838  if (x_distance > 0) {
839  // Arbitrarily divide x-distance by 2 if there is a major y overlap,
840  // and the word is to the left of the diacritic. If the
841  // diacritic is a dropped broken character between two words, this will
842  // help send all the pieces to a single word, instead of splitting them
843  // over the 2 words.
844  if (word_box.major_y_overlap(blob_box) &&
845  blob_box.left() > word_box.right()) {
846  x_distance /= 2;
847  }
848  y_distance += x_distance;
849  }
850  if (word_box.y_middle() > blob_box.y_middle() &&
851  (best_above_word == nullptr || y_distance < best_above_distance)) {
852  best_above_word = word;
853  best_above_distance = y_distance;
854  }
855  if (word_box.y_middle() <= blob_box.y_middle() &&
856  (best_below_word == nullptr || y_distance < best_below_distance)) {
857  best_below_word = word;
858  best_below_distance = y_distance;
859  }
860  }
861  bool above_good =
862  best_above_word != nullptr &&
863  (best_below_word == nullptr ||
864  best_above_distance < best_below_distance + blob_box.height());
865  bool below_good =
866  best_below_word != nullptr && best_below_word != best_above_word &&
867  (best_above_word == nullptr ||
868  best_below_distance < best_above_distance + blob_box.height());
869  if (below_good) {
870  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
871  copied_blob->rotate(rotation);
872  // Put the blob into the word's reject blobs list.
873  C_BLOB_IT blob_it(best_below_word->RejBlobs());
874  blob_it.add_to_end(copied_blob);
875  }
876  if (above_good) {
877  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
878  copied_blob->rotate(rotation);
879  // Put the blob into the word's reject blobs list.
880  C_BLOB_IT blob_it(best_above_word->RejBlobs());
881  blob_it.add_to_end(copied_blob);
882  }
883  }
884 }
885 
886 } // tesseract
887 
888 /**********************************************************************
889  * tweak_row_baseline
890  *
891  * Shift baseline to fit the blobs more accurately where they are
892  * close enough.
893  **********************************************************************/
894 
896  double blshift_maxshift,
897  double blshift_xfraction) {
898  TBOX blob_box; //bounding box
899  C_BLOB *blob; //current blob
900  WERD *word; //current word
901  int32_t blob_count; //no of blobs
902  int32_t src_index; //source segment
903  int32_t dest_index; //destination segment
904  float ydiff; //baseline error
905  float x_centre; //centre of blob
906  //words of row
907  WERD_IT word_it = row->word_list ();
908  C_BLOB_IT blob_it; //blob iterator
909 
910  blob_count = 0;
911  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
912  word = word_it.data (); //current word
913  //get total blobs
914  blob_count += word->cblob_list ()->length ();
915  }
916  if (blob_count == 0)
917  return;
918  // spline segments
919  std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1);
920  // spline coeffs
921  std::vector<double> coeffs((blob_count + row->baseline.segments) * 3);
922 
923  src_index = 0;
924  dest_index = 0;
925  xstarts[0] = row->baseline.xcoords[0];
926  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
927  word = word_it.data (); //current word
928  //blobs in word
929  blob_it.set_to_list (word->cblob_list ());
930  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
931  blob_it.forward ()) {
932  blob = blob_it.data ();
933  blob_box = blob->bounding_box ();
934  x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
935  ydiff = blob_box.bottom () - row->base_line (x_centre);
936  if (ydiff < 0)
937  ydiff = -ydiff / row->x_height ();
938  else
939  ydiff = ydiff / row->x_height ();
940  if (ydiff < blshift_maxshift
941  && blob_box.height () / row->x_height () > blshift_xfraction) {
942  if (xstarts[dest_index] >= x_centre)
943  xstarts[dest_index] = blob_box.left ();
944  coeffs[dest_index * 3] = 0;
945  coeffs[dest_index * 3 + 1] = 0;
946  coeffs[dest_index * 3 + 2] = blob_box.bottom ();
947  //shift it
948  dest_index++;
949  xstarts[dest_index] = blob_box.right () + 1;
950  }
951  else {
952  if (xstarts[dest_index] <= x_centre) {
953  while (row->baseline.xcoords[src_index + 1] <= x_centre
954  && src_index < row->baseline.segments - 1) {
955  if (row->baseline.xcoords[src_index + 1] >
956  xstarts[dest_index]) {
957  coeffs[dest_index * 3] =
958  row->baseline.quadratics[src_index].a;
959  coeffs[dest_index * 3 + 1] =
960  row->baseline.quadratics[src_index].b;
961  coeffs[dest_index * 3 + 2] =
962  row->baseline.quadratics[src_index].c;
963  dest_index++;
964  xstarts[dest_index] =
965  row->baseline.xcoords[src_index + 1];
966  }
967  src_index++;
968  }
969  coeffs[dest_index * 3] =
970  row->baseline.quadratics[src_index].a;
971  coeffs[dest_index * 3 + 1] =
972  row->baseline.quadratics[src_index].b;
973  coeffs[dest_index * 3 + 2] =
974  row->baseline.quadratics[src_index].c;
975  dest_index++;
976  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
977  }
978  }
979  }
980  }
981  while (src_index < row->baseline.segments
982  && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
983  src_index++;
984  while (src_index < row->baseline.segments) {
985  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
986  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
987  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
988  dest_index++;
989  src_index++;
990  xstarts[dest_index] = row->baseline.xcoords[src_index];
991  }
992  //turn to spline
993  row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]);
994 }
void CleanNoise(float size_threshold)
Definition: werd.cpp:505
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
double a
Definition: quadratc.h:57
double textord_min_linesize
Definition: makerow.cpp:82
void rotate(const FCOORD &vec)
Definition: rect.h:197
FCOORD re_rotation() const
Definition: ocrblock.h:136
int y_gap(const TBOX &box) const
Definition: rect.h:233
double textord_excess_blobsize
Definition: makerow.cpp:84
double textord_noise_sxfract
Definition: textord.h:392
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:895
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:133
TBOX bounding_box() const
Definition: werd.cpp:159
float base_line(float xpos) const
Definition: ocrrow.h:59
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:69
int32_t enclosed_area() const
Definition: blobbox.h:254
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:118
Definition: rect.h:34
int textord_max_noise_size
Definition: textord.h:376
WERD_LIST * word_list()
Definition: ocrrow.h:55
int textord_noise_sizefraction
Definition: textord.h:384
#define LOC_EDGE_PROG
Definition: errcode.h:44
float angle() const
find angle
Definition: points.h:248
bool textord_show_boxes
Definition: textord.h:375
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:723
bool textord_test_landscape
Definition: makerow.cpp:49
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:219
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:130
int x_gap(const TBOX &box) const
Definition: rect.h:225
bool textord_no_rejects
Definition: textord.h:373
int textord_noise_sncount
Definition: textord.h:395
double textord_noise_normratio
Definition: textord.h:387
float c
Definition: quadratc.h:59
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
Definition: statistc.h:33
static const double kDescenderFraction
Definition: ccstruct.h:33
float b
Definition: quadratc.h:58
float line_spacing
Definition: blobbox.h:792
int16_t width() const
Definition: rect.h:115
static const double kXHeightFraction
Definition: ccstruct.h:34
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1072
float max_blob_size
Definition: blobbox.h:799
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:65
int16_t top() const
Definition: rect.h:58
float x_height() const
Definition: ocrrow.h:64
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:66
integer coordinate
Definition: points.h:32
void SetBlobStrokeWidth(Pix *pix, BLOBNBOX *blob)
Definition: tordmain.cpp:69
double textord_noise_hfract
Definition: textord.h:394
double textord_blshift_maxshift
Definition: textord.h:398
double textord_width_limit
Definition: makerow.cpp:76
double ile(double frac) const
Definition: statistc.cpp:173
bool textord_show_blobs
Definition: textord.h:374
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
Definition: werd.h:59
int textord_noise_translimit
Definition: textord.h:386
TBOX bounding_box() const
Definition: ocrrow.h:88
Definition: ocrrow.h:36
const TBOX & bounding_box() const
Definition: coutln.h:113
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool IsText() const
Definition: polyblk.h:49
double textord_initialasc_ile
Definition: textord.h:383
int textord_test_y
Definition: makerow.cpp:62
Definition: ocrblock.h:30
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
double textord_noise_rowratio
Definition: textord.h:396
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
C_OUTLINE_LIST * child()
Definition: coutln.h:108
int push_back(T object)
GenericVector< BLOCK * > blocks
Definition: tordmain.cpp:739
EXTERN ScrollView * to_win
Definition: drawtord.cpp:37
void recalc_bounding_box()
Definition: ocrrow.cpp:101
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
Definition: tordmain.cpp:250
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:58
int textord_test_x
Definition: makerow.cpp:61
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:80
void assign_blobs_to_blocks2(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:168
TBOX bounding_box() const
Definition: stepblob.cpp:255
double textord_blshift_xfraction
Definition: textord.h:399
static const double kAscenderFraction
Definition: ccstruct.h:35
double textord_noise_area_ratio
Definition: textord.h:380
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it)
Definition: coutln.cpp:628
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
static const double kXHeightCapRatio
Definition: ccstruct.h:37
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
Definition: points.h:189
const TBOX & bounding_box() const
Definition: blobbox.h:231
void Clear()
Definition: scrollview.cpp:591
int16_t right() const
Definition: rect.h:79
BLOBNBOX_LIST blobs
Definition: blobbox.h:785
int y_middle() const
Definition: rect.h:88
int32_t count_transitions(int32_t threshold)
Definition: stepblob.cpp:335
bool textord_noise_rejwords
Definition: textord.h:388
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:46
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:789
bool textord_noise_rejrows
Definition: textord.h:389
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:243
int16_t bottom() const
Definition: rect.h:65
void set_y(float yin)
rewrite function
Definition: points.h:219
double textord_initialx_ile
Definition: textord.h:382
double textord_noise_sizelimit
Definition: textord.h:385
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
double textord_noise_syfract
Definition: textord.h:390
PDBLK pdblk
Definition: ocrblock.h:192
int16_t height() const
Definition: rect.h:108
C_BLOB * cblob() const
Definition: blobbox.h:269
void rotate(const FCOORD &rotation)
Definition: stepblob.cpp:393
int32_t get_total() const
Definition: statistc.h:86
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
float y() const
Definition: points.h:211
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:788
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:787
float line_size
Definition: blobbox.h:798
bool textord_noise_debug
Definition: textord.h:397
int32_t x_height() const
return xheight
Definition: ocrblock.h:108
#define ASSERT_HOST(x)
Definition: errcode.h:84
TBOX true_bounding_box() const
Definition: werd.cpp:180