tesseract  4.0.0-1-g2a2b
edgblob.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: edgblob.cpp (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author: Ray Smith
5  * Created: Tue Mar 26 16:56:25 GMT 1991
6  *
7  *(C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0(the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "scanedg.h"
21 #include "drawedg.h"
22 #include "edgloop.h"
23 #include "edgblob.h"
24 
25 // Include automatically generated configuration file if running autoconf.
26 #ifdef HAVE_CONFIG_H
27 #include "config_auto.h"
28 #endif
29 
30 #define EXTERN
31 
32 // Control parameters used in outline_complexity(), which rejects an outline
33 // if any one of the 3 conditions is satisfied:
34 // - number of children exceeds edges_max_children_per_outline
35 // - number of nested layers exceeds edges_max_children_layers
36 // - joint complexity exceeds edges_children_count_limit(as in child_count())
38  "Use the new outline complexity module");
40  "Max number of children inside a character outline");
42  "Max layers of nested children inside a character outline");
44  "turn on debugging for this module");
45 
46 
48  "Importance ratio for chucking outlines");
50  "Max holes allowed in blob");
52  "Remove boxy parents of char-like children");
54  "Min pixels for potential char in box");
56  "Max lensq/area for acceptable child outline");
58  "Min area fraction of child outline");
60  "Min area fraction of grandchild for box");
61 
69 ICOORD bleft, // corners
70 ICOORD tright): bl(bleft), tr(tright) {
71  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
72  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
73  // make array
74  buckets.reset(new C_OUTLINE_LIST[bxdim * bydim]);
75  index = 0;
76 }
77 
78 
86 C_OUTLINE_LIST *
87 OL_BUCKETS::operator()( // array access
88 int16_t x, // image coords
89 int16_t y) {
90  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
91 }
92 
93 
115  C_OUTLINE *outline, // parent outline
116  int32_t max_count, // max output
117  int16_t depth // recurion depth
118  ) {
119  int16_t xmin, xmax; // coord limits
120  int16_t ymin, ymax;
121  int16_t xindex, yindex; // current bucket
122  C_OUTLINE *child; // current child
123  int32_t child_count; // no of children
124  int32_t grandchild_count; // no of grandchildren
125  C_OUTLINE_IT child_it; // search iterator
126 
127  TBOX olbox = outline->bounding_box();
128  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
129  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
130  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
131  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
132  child_count = 0;
133  grandchild_count = 0;
134  if (++depth > edges_max_children_layers) // nested loops are too deep
135  return max_count + depth;
136 
137  for (yindex = ymin; yindex <= ymax; yindex++) {
138  for (xindex = xmin; xindex <= xmax; xindex++) {
139  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
140  if (child_it.empty())
141  continue;
142  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
143  child_it.forward()) {
144  child = child_it.data();
145  if (child == outline || !(*child < *outline))
146  continue;
147  child_count++;
148 
149  if (child_count > edges_max_children_per_outline) { // too fragmented
150  if (edges_debug)
151  tprintf("Discard outline on child_count=%d > "
152  "max_children_per_outline=%d\n",
153  child_count,
154  static_cast<int32_t>(edges_max_children_per_outline));
155  return max_count + child_count;
156  }
157 
158  // Compute the "complexity" of each child recursively
159  int32_t remaining_count = max_count - child_count - grandchild_count;
160  if (remaining_count > 0)
161  grandchild_count += edges_children_per_grandchild *
162  outline_complexity(child, remaining_count, depth);
163  if (child_count + grandchild_count > max_count) { // too complex
164  if (edges_debug)
165  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
166  "> max_count=%d\n",
167  child_count, grandchild_count, max_count);
168  return child_count + grandchild_count;
169  }
170  }
171  }
172  }
173  return child_count + grandchild_count;
174 }
175 
176 
182 // TODO(rays) Merge with outline_complexity.
183 int32_t OL_BUCKETS::count_children( // recursive count
184  C_OUTLINE *outline, // parent outline
185  int32_t max_count // max output
186  ) {
187  bool parent_box; // could it be boxy
188  int16_t xmin, xmax; // coord limits
189  int16_t ymin, ymax;
190  int16_t xindex, yindex; // current bucket
191  C_OUTLINE *child; // current child
192  int32_t child_count; // no of children
193  int32_t grandchild_count; // no of grandchildren
194  int32_t parent_area; // potential box
195  float max_parent_area; // potential box
196  int32_t child_area; // current child
197  int32_t child_length; // current child
198  TBOX olbox;
199  C_OUTLINE_IT child_it; // search iterator
200 
201  olbox = outline->bounding_box();
202  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
203  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
204  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
205  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
206  child_count = 0;
207  grandchild_count = 0;
208  parent_area = 0;
209  max_parent_area = 0;
210  parent_box = true;
211  for (yindex = ymin; yindex <= ymax; yindex++) {
212  for (xindex = xmin; xindex <= xmax; xindex++) {
213  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
214  if (child_it.empty())
215  continue;
216  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
217  child_it.forward()) {
218  child = child_it.data();
219  if (child != outline && *child < *outline) {
220  child_count++;
221  if (child_count <= max_count) {
222  int max_grand =(max_count - child_count) /
224  if (max_grand > 0)
225  grandchild_count += count_children(child, max_grand) *
227  else
228  grandchild_count += count_children(child, 1);
229  }
230  if (child_count + grandchild_count > max_count) {
231  if (edges_debug)
232  tprintf("Discarding parent with child count=%d, gc=%d\n",
233  child_count,grandchild_count);
234  return child_count + grandchild_count;
235  }
236  if (parent_area == 0) {
237  parent_area = outline->outer_area();
238  if (parent_area < 0)
239  parent_area = -parent_area;
240  max_parent_area = outline->bounding_box().area() * edges_boxarea;
241  if (parent_area < max_parent_area)
242  parent_box = false;
243  }
244  if (parent_box &&
245  (!edges_children_fix ||
246  child->bounding_box().height() > edges_min_nonhole)) {
247  child_area = child->outer_area();
248  if (child_area < 0)
249  child_area = -child_area;
250  if (edges_children_fix) {
251  if (parent_area - child_area < max_parent_area) {
252  parent_box = false;
253  continue;
254  }
255  if (grandchild_count > 0) {
256  if (edges_debug)
257  tprintf("Discarding parent of area %d, child area=%d, max%g "
258  "with gc=%d\n",
259  parent_area, child_area, max_parent_area,
260  grandchild_count);
261  return max_count + 1;
262  }
263  child_length = child->pathlength();
264  if (child_length * child_length >
265  child_area * edges_patharea_ratio) {
266  if (edges_debug)
267  tprintf("Discarding parent of area %d, child area=%d, max%g "
268  "with child length=%d\n",
269  parent_area, child_area, max_parent_area,
270  child_length);
271  return max_count + 1;
272  }
273  }
274  if (child_area < child->bounding_box().area() * edges_childarea) {
275  if (edges_debug)
276  tprintf("Discarding parent of area %d, child area=%d, max%g "
277  "with child rect=%d\n",
278  parent_area, child_area, max_parent_area,
279  child->bounding_box().area());
280  return max_count + 1;
281  }
282  }
283  }
284  }
285  }
286  }
287  return child_count + grandchild_count;
288 }
289 
290 
291 
292 
299 void OL_BUCKETS::extract_children( // recursive count
300  C_OUTLINE *outline, // parent outline
301  C_OUTLINE_IT *it // destination iterator
302  ) {
303  int16_t xmin, xmax; // coord limits
304  int16_t ymin, ymax;
305  int16_t xindex, yindex; // current bucket
306  TBOX olbox;
307  C_OUTLINE_IT child_it; // search iterator
308 
309  olbox = outline->bounding_box();
310  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
311  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
312  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
313  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
314  for (yindex = ymin; yindex <= ymax; yindex++) {
315  for (xindex = xmin; xindex <= xmax; xindex++) {
316  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
317  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
318  child_it.forward()) {
319  if (*child_it.data() < *outline) {
320  it->add_after_then_move(child_it.extract());
321  }
322  }
323  }
324  }
325 }
326 
327 
334 void extract_edges(Pix* pix, // thresholded image
335  BLOCK *block) { // block to scan
336  C_OUTLINE_LIST outlines; // outlines in block
337  C_OUTLINE_IT out_it = &outlines;
338 
339  block_edges(pix, &(block->pdblk), &out_it);
340  ICOORD bleft; // block box
341  ICOORD tright;
342  block->pdblk.bounding_box(bleft, tright);
343  // make blobs
344  outlines_to_blobs(block, bleft, tright, &outlines);
345 }
346 
347 
354 void outlines_to_blobs( // find blobs
355  BLOCK *block, // block to scan
356  ICOORD bleft,
357  ICOORD tright,
358  C_OUTLINE_LIST *outlines) {
359  // make buckets
360  OL_BUCKETS buckets(bleft, tright);
361 
362  fill_buckets(outlines, &buckets);
363  empty_buckets(block, &buckets);
364 }
365 
366 
373 void fill_buckets( // find blobs
374  C_OUTLINE_LIST *outlines, // outlines in block
375  OL_BUCKETS *buckets // output buckets
376  ) {
377  TBOX ol_box; // outline box
378  C_OUTLINE_IT out_it = outlines; // iterator
379  C_OUTLINE_IT bucket_it; // iterator in bucket
380  C_OUTLINE *outline; // current outline
381 
382  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
383  outline = out_it.extract(); // take off list
384  // get box
385  ol_box = outline->bounding_box();
386  bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
387  bucket_it.add_to_end(outline);
388  }
389 }
390 
391 
398 void empty_buckets( // find blobs
399  BLOCK *block, // block to scan
400  OL_BUCKETS *buckets // output buckets
401  ) {
402  bool good_blob; // healthy blob
403  C_OUTLINE_LIST outlines; // outlines in block
404  // iterator
405  C_OUTLINE_IT out_it = &outlines;
406  C_OUTLINE_IT bucket_it = buckets->start_scan();
407  C_OUTLINE_IT parent_it; // parent outline
408  C_BLOB_IT good_blobs = block->blob_list();
409  C_BLOB_IT junk_blobs = block->reject_blobs();
410 
411  while (!bucket_it.empty()) {
412  out_it.set_to_list(&outlines);
413  do {
414  parent_it = bucket_it; // find outermost
415  do {
416  bucket_it.forward();
417  } while (!bucket_it.at_first() &&
418  !(*parent_it.data() < *bucket_it.data()));
419  } while (!bucket_it.at_first());
420 
421  // move to new list
422  out_it.add_after_then_move(parent_it.extract());
423  good_blob = capture_children(buckets, &junk_blobs, &out_it);
424  C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
425  &junk_blobs);
426 
427  bucket_it.set_to_list(buckets->scan_next());
428  }
429 }
430 
431 
440 bool capture_children( // find children
441  OL_BUCKETS* buckets, // bucket sort clanss
442  C_BLOB_IT* reject_it, // dead grandchildren
443  C_OUTLINE_IT* blob_it // output outlines
444 ) {
445  C_OUTLINE *outline; // master outline
446  int32_t child_count; // no of children
447 
448  outline = blob_it->data();
450  child_count = buckets->outline_complexity(outline,
452  0);
453  else
454  child_count = buckets->count_children(outline,
456  if (child_count > edges_children_count_limit)
457  return false;
458 
459  if (child_count > 0)
460  buckets->extract_children(outline, blob_it);
461  return true;
462 }
EXTERN bool edges_children_fix
Definition: edgblob.cpp:52
C_OUTLINE_LIST * operator()(int16_t x, int16_t y)
Definition: edgblob.cpp:87
EXTERN int edges_max_children_layers
Definition: edgblob.cpp:42
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
void fill_buckets(C_OUTLINE_LIST *outlines, OL_BUCKETS *buckets)
Definition: edgblob.cpp:373
EXTERN bool edges_debug
Definition: edgblob.cpp:44
#define double_VAR(name, val, comment)
Definition: params.h:285
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:133
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:183
int16_t y() const
access_function
Definition: points.h:57
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:354
Definition: rect.h:34
EXTERN int edges_children_count_limit
Definition: edgblob.cpp:50
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:130
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:299
EXTERN bool edges_use_new_outline_complexity
Definition: edgblob.cpp:38
int16_t left() const
Definition: rect.h:72
EXTERN double edges_boxarea
Definition: edgblob.cpp:60
int16_t top() const
Definition: rect.h:58
void empty_buckets(BLOCK *block, OL_BUCKETS *buckets)
Definition: edgblob.cpp:398
integer coordinate
Definition: points.h:32
int16_t x() const
access function
Definition: points.h:53
#define FALSE
Definition: capi.h:52
#define EXTERN
Definition: edgblob.cpp:30
bool capture_children(OL_BUCKETS *buckets, C_BLOB_IT *reject_it, C_OUTLINE_IT *blob_it)
Definition: edgblob.cpp:440
EXTERN int edges_max_children_per_outline
Definition: edgblob.cpp:40
int32_t pathlength() const
Definition: coutln.h:135
#define BUCKETSIZE
Definition: edgblob.h:30
const TBOX & bounding_box() const
Definition: coutln.h:113
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: ocrblock.h:30
EXTERN int edges_min_nonhole
Definition: edgblob.cpp:54
int32_t area() const
Definition: rect.h:122
EXTERN double edges_childarea
Definition: edgblob.cpp:58
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:114
C_OUTLINE_LIST * scan_next()
Definition: edgblob.h:51
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:191
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:48
C_OUTLINE_LIST * start_scan()
Definition: edgblob.h:45
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:68
int16_t right() const
Definition: rect.h:79
EXTERN int edges_patharea_ratio
Definition: edgblob.cpp:56
int16_t bottom() const
Definition: rect.h:65
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
PDBLK pdblk
Definition: ocrblock.h:192
int32_t outer_area() const
Definition: coutln.cpp:309
int16_t height() const
Definition: rect.h:108
void block_edges(Pix *t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:37
#define INT_VAR(name, val, comment)
Definition: params.h:276