tesseract  5.0.0-alpha-619-ge9db
edgblob.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: edgblob.cpp (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author: Ray Smith
5  *
6  *(C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0(the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "scanedg.h"
20 #include "edgloop.h"
21 #include "edgblob.h"
22 
23 // Include automatically generated configuration file if running autoconf.
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 // Control parameters used in outline_complexity(), which rejects an outline
29 // if any one of the 3 conditions is satisfied:
30 // - number of children exceeds edges_max_children_per_outline
31 // - number of nested layers exceeds edges_max_children_layers
32 // - joint complexity exceeds edges_children_count_limit(as in child_count())
33 static BOOL_VAR(edges_use_new_outline_complexity, false,
34  "Use the new outline complexity module");
35 static INT_VAR(edges_max_children_per_outline, 10,
36  "Max number of children inside a character outline");
37 static INT_VAR(edges_max_children_layers, 5,
38  "Max layers of nested children inside a character outline");
39 static BOOL_VAR(edges_debug, false,
40  "turn on debugging for this module");
41 
42 static INT_VAR(edges_children_per_grandchild, 10,
43  "Importance ratio for chucking outlines");
44 static INT_VAR(edges_children_count_limit, 45,
45  "Max holes allowed in blob");
46 static BOOL_VAR(edges_children_fix, false,
47  "Remove boxy parents of char-like children");
48 static INT_VAR(edges_min_nonhole, 12,
49  "Min pixels for potential char in box");
50 static INT_VAR(edges_patharea_ratio, 40,
51  "Max lensq/area for acceptable child outline");
52 static double_VAR(edges_childarea, 0.5,
53  "Min area fraction of child outline");
54 static double_VAR(edges_boxarea, 0.875,
55  "Min area fraction of grandchild for box");
56 
64 ICOORD bleft, // corners
65 ICOORD tright): bl(bleft), tr(tright) {
66  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
67  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
68  // make array
69  buckets.reset(new C_OUTLINE_LIST[bxdim * bydim]);
70  index = 0;
71 }
72 
73 
81 C_OUTLINE_LIST *
82 OL_BUCKETS::operator()( // array access
83 int16_t x, // image coords
84 int16_t y) {
85  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
86 }
87 
88 
110  C_OUTLINE *outline, // parent outline
111  int32_t max_count, // max output
112  int16_t depth // recurion depth
113  ) {
114  int16_t xmin, xmax; // coord limits
115  int16_t ymin, ymax;
116  int16_t xindex, yindex; // current bucket
117  C_OUTLINE *child; // current child
118  int32_t child_count; // no of children
119  int32_t grandchild_count; // no of grandchildren
120  C_OUTLINE_IT child_it; // search iterator
121 
122  TBOX olbox = outline->bounding_box();
123  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
124  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
125  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
126  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
127  child_count = 0;
128  grandchild_count = 0;
129  if (++depth > edges_max_children_layers) // nested loops are too deep
130  return max_count + depth;
131 
132  for (yindex = ymin; yindex <= ymax; yindex++) {
133  for (xindex = xmin; xindex <= xmax; xindex++) {
134  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
135  if (child_it.empty())
136  continue;
137  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
138  child_it.forward()) {
139  child = child_it.data();
140  if (child == outline || !(*child < *outline))
141  continue;
142  child_count++;
143 
144  if (child_count > edges_max_children_per_outline) { // too fragmented
145  if (edges_debug)
146  tprintf("Discard outline on child_count=%d > "
147  "max_children_per_outline=%d\n",
148  child_count,
149  static_cast<int32_t>(edges_max_children_per_outline));
150  return max_count + child_count;
151  }
152 
153  // Compute the "complexity" of each child recursively
154  int32_t remaining_count = max_count - child_count - grandchild_count;
155  if (remaining_count > 0)
156  grandchild_count += edges_children_per_grandchild *
157  outline_complexity(child, remaining_count, depth);
158  if (child_count + grandchild_count > max_count) { // too complex
159  if (edges_debug)
160  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
161  "> max_count=%d\n",
162  child_count, grandchild_count, max_count);
163  return child_count + grandchild_count;
164  }
165  }
166  }
167  }
168  return child_count + grandchild_count;
169 }
170 
171 
177 // TODO(rays) Merge with outline_complexity.
178 int32_t OL_BUCKETS::count_children( // recursive count
179  C_OUTLINE *outline, // parent outline
180  int32_t max_count // max output
181  ) {
182  bool parent_box; // could it be boxy
183  int16_t xmin, xmax; // coord limits
184  int16_t ymin, ymax;
185  int16_t xindex, yindex; // current bucket
186  C_OUTLINE *child; // current child
187  int32_t child_count; // no of children
188  int32_t grandchild_count; // no of grandchildren
189  int32_t parent_area; // potential box
190  float max_parent_area; // potential box
191  int32_t child_area; // current child
192  int32_t child_length; // current child
193  TBOX olbox;
194  C_OUTLINE_IT child_it; // search iterator
195 
196  olbox = outline->bounding_box();
197  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
198  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
199  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
200  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
201  child_count = 0;
202  grandchild_count = 0;
203  parent_area = 0;
204  max_parent_area = 0;
205  parent_box = true;
206  for (yindex = ymin; yindex <= ymax; yindex++) {
207  for (xindex = xmin; xindex <= xmax; xindex++) {
208  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
209  if (child_it.empty())
210  continue;
211  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
212  child_it.forward()) {
213  child = child_it.data();
214  if (child != outline && *child < *outline) {
215  child_count++;
216  if (child_count <= max_count) {
217  int max_grand =(max_count - child_count) /
218  edges_children_per_grandchild;
219  if (max_grand > 0)
220  grandchild_count += count_children(child, max_grand) *
221  edges_children_per_grandchild;
222  else
223  grandchild_count += count_children(child, 1);
224  }
225  if (child_count + grandchild_count > max_count) {
226  if (edges_debug)
227  tprintf("Discarding parent with child count=%d, gc=%d\n",
228  child_count,grandchild_count);
229  return child_count + grandchild_count;
230  }
231  if (parent_area == 0) {
232  parent_area = outline->outer_area();
233  if (parent_area < 0)
234  parent_area = -parent_area;
235  max_parent_area = outline->bounding_box().area() * edges_boxarea;
236  if (parent_area < max_parent_area)
237  parent_box = false;
238  }
239  if (parent_box &&
240  (!edges_children_fix ||
241  child->bounding_box().height() > edges_min_nonhole)) {
242  child_area = child->outer_area();
243  if (child_area < 0)
244  child_area = -child_area;
245  if (edges_children_fix) {
246  if (parent_area - child_area < max_parent_area) {
247  parent_box = false;
248  continue;
249  }
250  if (grandchild_count > 0) {
251  if (edges_debug)
252  tprintf("Discarding parent of area %d, child area=%d, max%g "
253  "with gc=%d\n",
254  parent_area, child_area, max_parent_area,
255  grandchild_count);
256  return max_count + 1;
257  }
258  child_length = child->pathlength();
259  if (child_length * child_length >
260  child_area * edges_patharea_ratio) {
261  if (edges_debug)
262  tprintf("Discarding parent of area %d, child area=%d, max%g "
263  "with child length=%d\n",
264  parent_area, child_area, max_parent_area,
265  child_length);
266  return max_count + 1;
267  }
268  }
269  if (child_area < child->bounding_box().area() * edges_childarea) {
270  if (edges_debug)
271  tprintf("Discarding parent of area %d, child area=%d, max%g "
272  "with child rect=%d\n",
273  parent_area, child_area, max_parent_area,
274  child->bounding_box().area());
275  return max_count + 1;
276  }
277  }
278  }
279  }
280  }
281  }
282  return child_count + grandchild_count;
283 }
284 
285 
286 
287 
294 void OL_BUCKETS::extract_children( // recursive count
295  C_OUTLINE *outline, // parent outline
296  C_OUTLINE_IT *it // destination iterator
297  ) {
298  int16_t xmin, xmax; // coord limits
299  int16_t ymin, ymax;
300  int16_t xindex, yindex; // current bucket
301  TBOX olbox;
302  C_OUTLINE_IT child_it; // search iterator
303 
304  olbox = outline->bounding_box();
305  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
306  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
307  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
308  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
309  for (yindex = ymin; yindex <= ymax; yindex++) {
310  for (xindex = xmin; xindex <= xmax; xindex++) {
311  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
312  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
313  child_it.forward()) {
314  if (*child_it.data() < *outline) {
315  it->add_after_then_move(child_it.extract());
316  }
317  }
318  }
319  }
320 }
321 
322 
329 void extract_edges(Pix* pix, // thresholded image
330  BLOCK *block) { // block to scan
331  C_OUTLINE_LIST outlines; // outlines in block
332  C_OUTLINE_IT out_it = &outlines;
333 
334  block_edges(pix, &(block->pdblk), &out_it);
335  ICOORD bleft; // block box
336  ICOORD tright;
337  block->pdblk.bounding_box(bleft, tright);
338  // make blobs
339  outlines_to_blobs(block, bleft, tright, &outlines);
340 }
341 
342 
349 void outlines_to_blobs( // find blobs
350  BLOCK *block, // block to scan
351  ICOORD bleft,
352  ICOORD tright,
353  C_OUTLINE_LIST *outlines) {
354  // make buckets
355  OL_BUCKETS buckets(bleft, tright);
356 
357  fill_buckets(outlines, &buckets);
358  empty_buckets(block, &buckets);
359 }
360 
361 
368 void fill_buckets( // find blobs
369  C_OUTLINE_LIST *outlines, // outlines in block
370  OL_BUCKETS *buckets // output buckets
371  ) {
372  TBOX ol_box; // outline box
373  C_OUTLINE_IT out_it = outlines; // iterator
374  C_OUTLINE_IT bucket_it; // iterator in bucket
375  C_OUTLINE *outline; // current outline
376 
377  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
378  outline = out_it.extract(); // take off list
379  // get box
380  ol_box = outline->bounding_box();
381  bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
382  bucket_it.add_to_end(outline);
383  }
384 }
385 
386 
393 void empty_buckets( // find blobs
394  BLOCK *block, // block to scan
395  OL_BUCKETS *buckets // output buckets
396  ) {
397  bool good_blob; // healthy blob
398  C_OUTLINE_LIST outlines; // outlines in block
399  // iterator
400  C_OUTLINE_IT out_it = &outlines;
401  C_OUTLINE_IT bucket_it = buckets->start_scan();
402  C_OUTLINE_IT parent_it; // parent outline
403  C_BLOB_IT good_blobs = block->blob_list();
404  C_BLOB_IT junk_blobs = block->reject_blobs();
405 
406  while (!bucket_it.empty()) {
407  out_it.set_to_list(&outlines);
408  do {
409  parent_it = bucket_it; // find outermost
410  do {
411  bucket_it.forward();
412  } while (!bucket_it.at_first() &&
413  !(*parent_it.data() < *bucket_it.data()));
414  } while (!bucket_it.at_first());
415 
416  // move to new list
417  out_it.add_after_then_move(parent_it.extract());
418  good_blob = capture_children(buckets, &junk_blobs, &out_it);
419  C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
420  &junk_blobs);
421 
422  bucket_it.set_to_list(buckets->scan_next());
423  }
424 }
425 
426 
435 bool capture_children( // find children
436  OL_BUCKETS* buckets, // bucket sort clanss
437  C_BLOB_IT* reject_it, // dead grandchildren
438  C_OUTLINE_IT* blob_it // output outlines
439 ) {
440  C_OUTLINE *outline; // master outline
441  int32_t child_count; // no of children
442 
443  outline = blob_it->data();
444  if (edges_use_new_outline_complexity)
445  child_count = buckets->outline_complexity(outline,
446  edges_children_count_limit,
447  0);
448  else
449  child_count = buckets->count_children(outline,
450  edges_children_count_limit);
451  if (child_count > edges_children_count_limit)
452  return false;
453 
454  if (child_count > 0)
455  buckets->extract_children(outline, blob_it);
456  return true;
457 }
INT_VAR
#define INT_VAR(name, val, comment)
Definition: params.h:300
PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:58
ICOORD
integer coordinate
Definition: points.h:30
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::area
int32_t area() const
Definition: rect.h:121
C_BLOB::ConstructBlobsFromOutlines
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:184
OL_BUCKETS
Definition: edgblob.h:31
OL_BUCKETS::outline_complexity
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:109
capture_children
bool capture_children(OL_BUCKETS *buckets, C_BLOB_IT *reject_it, C_OUTLINE_IT *blob_it)
Definition: edgblob.cpp:435
ICOORD::x
int16_t x() const
access function
Definition: points.h:51
C_OUTLINE::outer_area
int32_t outer_area() const
Definition: coutln.cpp:308
C_OUTLINE
Definition: coutln.h:71
TBOX::height
int16_t height() const
Definition: rect.h:107
OL_BUCKETS::operator()
C_OUTLINE_LIST * operator()(int16_t x, int16_t y)
Definition: edgblob.cpp:82
scanedg.h
extract_edges
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:329
BLOCK
Definition: ocrblock.h:28
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
edgblob.h
BOOL_VAR
#define BOOL_VAR(name, val, comment)
Definition: params.h:303
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
OL_BUCKETS::count_children
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:178
fill_buckets
void fill_buckets(C_OUTLINE_LIST *outlines, OL_BUCKETS *buckets)
Definition: edgblob.cpp:368
BUCKETSIZE
#define BUCKETSIZE
Definition: edgblob.h:29
double_VAR
#define double_VAR(name, val, comment)
Definition: params.h:309
block_edges
void block_edges(Pix *t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:35
OL_BUCKETS::start_scan
C_OUTLINE_LIST * start_scan()
Definition: edgblob.h:44
outlines_to_blobs
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:349
OL_BUCKETS::scan_next
C_OUTLINE_LIST * scan_next()
Definition: edgblob.h:50
TBOX::left
int16_t left() const
Definition: rect.h:71
empty_buckets
void empty_buckets(BLOCK *block, OL_BUCKETS *buckets)
Definition: edgblob.cpp:393
OL_BUCKETS::OL_BUCKETS
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:63
BLOCK::blob_list
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:127
C_OUTLINE::bounding_box
const TBOX & bounding_box() const
Definition: coutln.h:112
TBOX::right
int16_t right() const
Definition: rect.h:78
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
C_OUTLINE::pathlength
int32_t pathlength() const
Definition: coutln.h:134
OL_BUCKETS::extract_children
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:294
BLOCK::reject_blobs
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:130
edgloop.h
ICOORD::y
int16_t y() const
access_function
Definition: points.h:55
TBOX
Definition: rect.h:33