tesseract  5.0.0-alpha-619-ge9db
imagefind.cpp
Go to the documentation of this file.
1 // File: imagefind.cpp
3 // Description: Function to find image and drawing regions in an image
4 // and create a corresponding list of empty blobs.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "imagefind.h"
25 #include "colpartitiongrid.h"
26 #include "linlsq.h"
27 #include "statistc.h"
28 #include "params.h"
29 
30 #include "allheaders.h"
31 
32 #include <algorithm>
33 
34 static INT_VAR(textord_tabfind_show_images, false, "Show image blobs");
35 
36 namespace tesseract {
37 
38 // Fraction of width or height of on pixels that can be discarded from a
39 // roughly rectangular image.
40 const double kMinRectangularFraction = 0.125;
41 // Fraction of width or height to consider image completely used.
42 const double kMaxRectangularFraction = 0.75;
43 // Fraction of width or height to allow transition from kMinRectangularFraction
44 // to kMaxRectangularFraction, equivalent to a dy/dx skew.
45 const double kMaxRectangularGradient = 0.1; // About 6 degrees.
46 // Minimum image size to be worth looking for images on.
47 const int kMinImageFindSize = 100;
48 // Scale factor for the rms color fit error.
49 const double kRMSFitScaling = 8.0;
50 // Min color difference to call it two colors.
51 const int kMinColorDifference = 16;
52 // Pixel padding for noise blobs and partitions when rendering on the image
53 // mask to encourage them to join together. Make it too big and images
54 // will fatten out too much and have to be clipped to text.
55 const int kNoisePadding = 4;
56 
57 // Finds image regions within the BINARY source pix (page image) and returns
58 // the image regions as a mask image.
59 // The returned pix may be nullptr, meaning no images found.
60 // If not nullptr, it must be PixDestroyed by the caller.
61 // If textord_tabfind_show_images, debug images are appended to pixa_debug.
62 Pix* ImageFind::FindImages(Pix* pix, DebugPixa* pixa_debug) {
63  // Not worth looking at small images.
64  if (pixGetWidth(pix) < kMinImageFindSize ||
65  pixGetHeight(pix) < kMinImageFindSize)
66  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
67 
68  // Reduce by factor 2.
69  Pix *pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0);
70  if (textord_tabfind_show_images && pixa_debug != nullptr)
71  pixa_debug->AddPix(pixr, "CascadeReduced");
72 
73  // Get the halftone mask directly from Leptonica.
74  //
75  // Leptonica will print an error message and return nullptr if we call
76  // pixGenHalftoneMask(pixr, nullptr, ...) with too small image, so we
77  // want to bypass that.
78  if (pixGetWidth(pixr) < kMinImageFindSize ||
79  pixGetHeight(pixr) < kMinImageFindSize) {
80  pixDestroy(&pixr);
81  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
82  }
83  // Get the halftone mask.
84  l_int32 ht_found = 0;
85  Pixa* pixadb = (textord_tabfind_show_images && pixa_debug != nullptr)
86  ? pixaCreate(0)
87  : nullptr;
88  Pix* pixht2 = pixGenerateHalftoneMask(pixr, nullptr, &ht_found, pixadb);
89  if (pixadb) {
90  Pix* pixdb = pixaDisplayTiledInColumns(pixadb, 3, 1.0, 20, 2);
91  if (textord_tabfind_show_images && pixa_debug != nullptr)
92  pixa_debug->AddPix(pixdb, "HalftoneMask");
93  pixDestroy(&pixdb);
94  pixaDestroy(&pixadb);
95  }
96  pixDestroy(&pixr);
97  if (!ht_found && pixht2 != nullptr)
98  pixDestroy(&pixht2);
99  if (pixht2 == nullptr)
100  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
101 
102  // Expand back up again.
103  Pix *pixht = pixExpandReplicate(pixht2, 2);
104  if (textord_tabfind_show_images && pixa_debug != nullptr)
105  pixa_debug->AddPix(pixht, "HalftoneReplicated");
106  pixDestroy(&pixht2);
107 
108  // Fill to capture pixels near the mask edges that were missed
109  Pix *pixt = pixSeedfillBinary(nullptr, pixht, pix, 8);
110  pixOr(pixht, pixht, pixt);
111  pixDestroy(&pixt);
112 
113  // Eliminate lines and bars that may be joined to images.
114  Pix* pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3);
115  pixDilateBrick(pixfinemask, pixfinemask, 5, 5);
116  if (textord_tabfind_show_images && pixa_debug != nullptr)
117  pixa_debug->AddPix(pixfinemask, "FineMask");
118  Pix* pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1);
119  Pix* pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0);
120  pixDestroy(&pixreduced);
121  pixDilateBrick(pixreduced2, pixreduced2, 5, 5);
122  Pix* pixcoarsemask = pixExpandReplicate(pixreduced2, 8);
123  pixDestroy(&pixreduced2);
124  if (textord_tabfind_show_images && pixa_debug != nullptr)
125  pixa_debug->AddPix(pixcoarsemask, "CoarseMask");
126  // Combine the coarse and fine image masks.
127  pixAnd(pixcoarsemask, pixcoarsemask, pixfinemask);
128  pixDestroy(&pixfinemask);
129  // Dilate a bit to make sure we get everything.
130  pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3);
131  Pix* pixmask = pixExpandReplicate(pixcoarsemask, 16);
132  pixDestroy(&pixcoarsemask);
133  if (textord_tabfind_show_images && pixa_debug != nullptr)
134  pixa_debug->AddPix(pixmask, "MaskDilated");
135  // And the image mask with the line and bar remover.
136  pixAnd(pixht, pixht, pixmask);
137  pixDestroy(&pixmask);
138  if (textord_tabfind_show_images && pixa_debug != nullptr)
139  pixa_debug->AddPix(pixht, "FinalMask");
140  // Make the result image the same size as the input.
141  Pix* result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
142  pixOr(result, result, pixht);
143  pixDestroy(&pixht);
144  return result;
145 }
146 
147 // Generates a Boxa, Pixa pair from the input binary (image mask) pix,
148 // analogous to pixConnComp, except that connected components which are nearly
149 // rectangular are replaced with solid rectangles.
150 // The returned boxa, pixa may be nullptr, meaning no images found.
151 // If not nullptr, they must be destroyed by the caller.
152 // Resolution of pix should match the source image (Tesseract::pix_binary_)
153 // so the output coordinate systems match.
155  Boxa** boxa, Pixa** pixa) {
156  *boxa = nullptr;
157  *pixa = nullptr;
158 
159  if (textord_tabfind_show_images && pixa_debug != nullptr)
160  pixa_debug->AddPix(pix, "Conncompimage");
161  // Find the individual image regions in the mask image.
162  *boxa = pixConnComp(pix, pixa, 8);
163  // Rectangularize the individual images. If a sharp edge in vertical and/or
164  // horizontal occupancy can be found, it indicates a probably rectangular
165  // image with unwanted bits merged on, so clip to the approximate rectangle.
166  int npixes = 0;
167  if (*boxa != nullptr && *pixa != nullptr) npixes = pixaGetCount(*pixa);
168  for (int i = 0; i < npixes; ++i) {
169  int x_start, x_end, y_start, y_end;
170  Pix* img_pix = pixaGetPix(*pixa, i, L_CLONE);
171  if (textord_tabfind_show_images && pixa_debug != nullptr)
172  pixa_debug->AddPix(img_pix, "A component");
176  &x_start, &y_start, &x_end, &y_end)) {
177  Pix* simple_pix = pixCreate(x_end - x_start, y_end - y_start, 1);
178  pixSetAll(simple_pix);
179  pixDestroy(&img_pix);
180  // pixaReplacePix takes ownership of the simple_pix.
181  pixaReplacePix(*pixa, i, simple_pix, nullptr);
182  img_pix = pixaGetPix(*pixa, i, L_CLONE);
183  // Fix the box to match the new pix.
184  l_int32 x, y, width, height;
185  boxaGetBoxGeometry(*boxa, i, &x, &y, &width, &height);
186  Box* simple_box = boxCreate(x + x_start, y + y_start,
187  x_end - x_start, y_end - y_start);
188  boxaReplaceBox(*boxa, i, simple_box);
189  }
190  pixDestroy(&img_pix);
191  }
192 }
193 
194 // Scans horizontally on x=[x_start,x_end), starting with y=*y_start,
195 // stepping y+=y_step, until y=y_end. *ystart is input/output.
196 // If the number of black pixels in a row, pix_count fits this pattern:
197 // 0 or more rows with pix_count < min_count then
198 // <= mid_width rows with min_count <= pix_count <= max_count then
199 // a row with pix_count > max_count then
200 // true is returned, and *y_start = the first y with pix_count >= min_count.
201 static bool HScanForEdge(uint32_t* data, int wpl, int x_start, int x_end,
202  int min_count, int mid_width, int max_count,
203  int y_end, int y_step, int* y_start) {
204  int mid_rows = 0;
205  for (int y = *y_start; y != y_end; y += y_step) {
206  // Need pixCountPixelsInRow(pix, y, &pix_count, nullptr) to count in a subset.
207  int pix_count = 0;
208  uint32_t* line = data + wpl * y;
209  for (int x = x_start; x < x_end; ++x) {
210  if (GET_DATA_BIT(line, x))
211  ++pix_count;
212  }
213  if (mid_rows == 0 && pix_count < min_count)
214  continue; // In the min phase.
215  if (mid_rows == 0)
216  *y_start = y; // Save the y_start where we came out of the min phase.
217  if (pix_count > max_count)
218  return true; // Found the pattern.
219  ++mid_rows;
220  if (mid_rows > mid_width)
221  break; // Middle too big.
222  }
223  return false; // Never found max_count.
224 }
225 
226 // Scans vertically on y=[y_start,y_end), starting with x=*x_start,
227 // stepping x+=x_step, until x=x_end. *x_start is input/output.
228 // If the number of black pixels in a column, pix_count fits this pattern:
229 // 0 or more cols with pix_count < min_count then
230 // <= mid_width cols with min_count <= pix_count <= max_count then
231 // a column with pix_count > max_count then
232 // true is returned, and *x_start = the first x with pix_count >= min_count.
233 static bool VScanForEdge(uint32_t* data, int wpl, int y_start, int y_end,
234  int min_count, int mid_width, int max_count,
235  int x_end, int x_step, int* x_start) {
236  int mid_cols = 0;
237  for (int x = *x_start; x != x_end; x += x_step) {
238  int pix_count = 0;
239  uint32_t* line = data + y_start * wpl;
240  for (int y = y_start; y < y_end; ++y, line += wpl) {
241  if (GET_DATA_BIT(line, x))
242  ++pix_count;
243  }
244  if (mid_cols == 0 && pix_count < min_count)
245  continue; // In the min phase.
246  if (mid_cols == 0)
247  *x_start = x; // Save the place where we came out of the min phase.
248  if (pix_count > max_count)
249  return true; // found the pattern.
250  ++mid_cols;
251  if (mid_cols > mid_width)
252  break; // Middle too big.
253  }
254  return false; // Never found max_count.
255 }
256 
257 // Returns true if there is a rectangle in the source pix, such that all
258 // pixel rows and column slices outside of it have less than
259 // min_fraction of the pixels black, and within max_skew_gradient fraction
260 // of the pixels on the inside, there are at least max_fraction of the
261 // pixels black. In other words, the inside of the rectangle looks roughly
262 // rectangular, and the outside of it looks like extra bits.
263 // On return, the rectangle is defined by x_start, y_start, x_end and y_end.
264 // Note: the algorithm is iterative, allowing it to slice off pixels from
265 // one edge, allowing it to then slice off more pixels from another edge.
267  double min_fraction, double max_fraction,
268  double max_skew_gradient,
269  int* x_start, int* y_start,
270  int* x_end, int* y_end) {
271  ASSERT_HOST(pix != nullptr);
272  *x_start = 0;
273  *x_end = pixGetWidth(pix);
274  *y_start = 0;
275  *y_end = pixGetHeight(pix);
276 
277  uint32_t* data = pixGetData(pix);
278  int wpl = pixGetWpl(pix);
279  bool any_cut = false;
280  bool left_done = false;
281  bool right_done = false;
282  bool top_done = false;
283  bool bottom_done = false;
284  do {
285  any_cut = false;
286  // Find the top/bottom edges.
287  int width = *x_end - *x_start;
288  int min_count = static_cast<int>(width * min_fraction);
289  int max_count = static_cast<int>(width * max_fraction);
290  int edge_width = static_cast<int>(width * max_skew_gradient);
291  if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width,
292  max_count, *y_end, 1, y_start) && !top_done) {
293  top_done = true;
294  any_cut = true;
295  }
296  --(*y_end);
297  if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width,
298  max_count, *y_start, -1, y_end) && !bottom_done) {
299  bottom_done = true;
300  any_cut = true;
301  }
302  ++(*y_end);
303 
304  // Find the left/right edges.
305  int height = *y_end - *y_start;
306  min_count = static_cast<int>(height * min_fraction);
307  max_count = static_cast<int>(height * max_fraction);
308  edge_width = static_cast<int>(height * max_skew_gradient);
309  if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width,
310  max_count, *x_end, 1, x_start) && !left_done) {
311  left_done = true;
312  any_cut = true;
313  }
314  --(*x_end);
315  if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width,
316  max_count, *x_start, -1, x_end) && !right_done) {
317  right_done = true;
318  any_cut = true;
319  }
320  ++(*x_end);
321  } while (any_cut);
322 
323  // All edges must satisfy the condition of sharp gradient in pixel density
324  // in order for the full rectangle to be present.
325  return left_done && right_done && top_done && bottom_done;
326 }
327 
328 // Given an input pix, and a bounding rectangle, the sides of the rectangle
329 // are shrunk inwards until they bound any black pixels found within the
330 // original rectangle. Returns false if the rectangle contains no black
331 // pixels at all.
332 bool ImageFind::BoundsWithinRect(Pix* pix, int* x_start, int* y_start,
333  int* x_end, int* y_end) {
334  Box* input_box = boxCreate(*x_start, *y_start, *x_end - *x_start,
335  *y_end - *y_start);
336  Box* output_box = nullptr;
337  pixClipBoxToForeground(pix, input_box, nullptr, &output_box);
338  bool result = output_box != nullptr;
339  if (result) {
340  l_int32 x, y, width, height;
341  boxGetGeometry(output_box, &x, &y, &width, &height);
342  *x_start = x;
343  *y_start = y;
344  *x_end = x + width;
345  *y_end = y + height;
346  boxDestroy(&output_box);
347  }
348  boxDestroy(&input_box);
349  return result;
350 }
351 
352 // Given a point in 3-D (RGB) space, returns the squared Euclidean distance
353 // of the point from the given line, defined by a pair of points in the 3-D
354 // (RGB) space, line1 and line2.
355 double ImageFind::ColorDistanceFromLine(const uint8_t* line1,
356  const uint8_t* line2,
357  const uint8_t* point) {
358  int line_vector[kRGBRMSColors];
359  int point_vector[kRGBRMSColors];
360  for (int i = 0; i < kRGBRMSColors; ++i) {
361  line_vector[i] = static_cast<int>(line2[i]) - static_cast<int>(line1[i]);
362  point_vector[i] = static_cast<int>(point[i]) - static_cast<int>(line1[i]);
363  }
364  line_vector[L_ALPHA_CHANNEL] = 0;
365  // Now the cross product in 3d.
366  int cross[kRGBRMSColors];
367  cross[COLOR_RED] = line_vector[COLOR_GREEN] * point_vector[COLOR_BLUE]
368  - line_vector[COLOR_BLUE] * point_vector[COLOR_GREEN];
369  cross[COLOR_GREEN] = line_vector[COLOR_BLUE] * point_vector[COLOR_RED]
370  - line_vector[COLOR_RED] * point_vector[COLOR_BLUE];
371  cross[COLOR_BLUE] = line_vector[COLOR_RED] * point_vector[COLOR_GREEN]
372  - line_vector[COLOR_GREEN] * point_vector[COLOR_RED];
373  cross[L_ALPHA_CHANNEL] = 0;
374  // Now the sums of the squares.
375  double cross_sq = 0.0;
376  double line_sq = 0.0;
377  for (int j = 0; j < kRGBRMSColors; ++j) {
378  cross_sq += static_cast<double>(cross[j]) * cross[j];
379  line_sq += static_cast<double>(line_vector[j]) * line_vector[j];
380  }
381  if (line_sq == 0.0) {
382  return 0.0;
383  }
384  return cross_sq / line_sq; // This is the squared distance.
385 }
386 
387 
388 // Returns the leptonica combined code for the given RGB triplet.
389 uint32_t ImageFind::ComposeRGB(uint32_t r, uint32_t g, uint32_t b) {
390  l_uint32 result;
391  composeRGBPixel(r, g, b, &result);
392  return result;
393 }
394 
395 // Returns the input value clipped to a uint8_t.
396 uint8_t ImageFind::ClipToByte(double pixel) {
397  if (pixel < 0.0)
398  return 0;
399  else if (pixel >= 255.0)
400  return 255;
401  return static_cast<uint8_t>(pixel);
402 }
403 
404 // Computes the light and dark extremes of color in the given rectangle of
405 // the given pix, which is factor smaller than the coordinate system in rect.
406 // The light and dark points are taken to be the upper and lower 8th-ile of
407 // the most deviant of R, G and B. The value of the other 2 channels are
408 // computed by linear fit against the most deviant.
409 // The colors of the two points are returned in color1 and color2, with the
410 // alpha channel set to a scaled mean rms of the fits.
411 // If color_map1 is not null then it and color_map2 get rect pasted in them
412 // with the two calculated colors, and rms map gets a pasted rect of the rms.
413 // color_map1, color_map2 and rms_map are assumed to be the same scale as pix.
414 void ImageFind::ComputeRectangleColors(const TBOX& rect, Pix* pix, int factor,
415  Pix* color_map1, Pix* color_map2,
416  Pix* rms_map,
417  uint8_t* color1, uint8_t* color2) {
418  ASSERT_HOST(pix != nullptr && pixGetDepth(pix) == 32);
419  // Pad the rectangle outwards by 2 (scaled) pixels if possible to get more
420  // background.
421  int width = pixGetWidth(pix);
422  int height = pixGetHeight(pix);
423  int left_pad = std::max(rect.left() - 2 * factor, 0) / factor;
424  int top_pad = (rect.top() + 2 * factor + (factor - 1)) / factor;
425  top_pad = std::min(height, top_pad);
426  int right_pad = (rect.right() + 2 * factor + (factor - 1)) / factor;
427  right_pad = std::min(width, right_pad);
428  int bottom_pad = std::max(rect.bottom() - 2 * factor, 0) / factor;
429  int width_pad = right_pad - left_pad;
430  int height_pad = top_pad - bottom_pad;
431  if (width_pad < 1 || height_pad < 1 || width_pad + height_pad < 4)
432  return;
433  // Now crop the pix to the rectangle.
434  Box* scaled_box = boxCreate(left_pad, height - top_pad,
435  width_pad, height_pad);
436  Pix* scaled = pixClipRectangle(pix, scaled_box, nullptr);
437 
438  // Compute stats over the whole image.
439  STATS red_stats(0, 256);
440  STATS green_stats(0, 256);
441  STATS blue_stats(0, 256);
442  uint32_t* data = pixGetData(scaled);
443  ASSERT_HOST(pixGetWpl(scaled) == width_pad);
444  for (int y = 0; y < height_pad; ++y) {
445  for (int x = 0; x < width_pad; ++x, ++data) {
446  int r = GET_DATA_BYTE(data, COLOR_RED);
447  int g = GET_DATA_BYTE(data, COLOR_GREEN);
448  int b = GET_DATA_BYTE(data, COLOR_BLUE);
449  red_stats.add(r, 1);
450  green_stats.add(g, 1);
451  blue_stats.add(b, 1);
452  }
453  }
454  // Find the RGB component with the greatest 8th-ile-range.
455  // 8th-iles are used instead of quartiles to get closer to the true
456  // foreground color, which is going to be faint at best because of the
457  // pre-scaling of the input image.
458  int best_l8 = static_cast<int>(red_stats.ile(0.125f));
459  int best_u8 = static_cast<int>(ceil(red_stats.ile(0.875f)));
460  int best_i8r = best_u8 - best_l8;
461  int x_color = COLOR_RED;
462  int y1_color = COLOR_GREEN;
463  int y2_color = COLOR_BLUE;
464  int l8 = static_cast<int>(green_stats.ile(0.125f));
465  int u8 = static_cast<int>(ceil(green_stats.ile(0.875f)));
466  if (u8 - l8 > best_i8r) {
467  best_i8r = u8 - l8;
468  best_l8 = l8;
469  best_u8 = u8;
470  x_color = COLOR_GREEN;
471  y1_color = COLOR_RED;
472  }
473  l8 = static_cast<int>(blue_stats.ile(0.125f));
474  u8 = static_cast<int>(ceil(blue_stats.ile(0.875f)));
475  if (u8 - l8 > best_i8r) {
476  best_i8r = u8 - l8;
477  best_l8 = l8;
478  best_u8 = u8;
479  x_color = COLOR_BLUE;
480  y1_color = COLOR_GREEN;
481  y2_color = COLOR_RED;
482  }
483  if (best_i8r >= kMinColorDifference) {
484  LLSQ line1;
485  LLSQ line2;
486  uint32_t* data = pixGetData(scaled);
487  for (int im_y = 0; im_y < height_pad; ++im_y) {
488  for (int im_x = 0; im_x < width_pad; ++im_x, ++data) {
489  int x = GET_DATA_BYTE(data, x_color);
490  int y1 = GET_DATA_BYTE(data, y1_color);
491  int y2 = GET_DATA_BYTE(data, y2_color);
492  line1.add(x, y1);
493  line2.add(x, y2);
494  }
495  }
496  double m1 = line1.m();
497  double c1 = line1.c(m1);
498  double m2 = line2.m();
499  double c2 = line2.c(m2);
500  double rms = line1.rms(m1, c1) + line2.rms(m2, c2);
501  rms *= kRMSFitScaling;
502  // Save the results.
503  color1[x_color] = ClipToByte(best_l8);
504  color1[y1_color] = ClipToByte(m1 * best_l8 + c1 + 0.5);
505  color1[y2_color] = ClipToByte(m2 * best_l8 + c2 + 0.5);
506  color1[L_ALPHA_CHANNEL] = ClipToByte(rms);
507  color2[x_color] = ClipToByte(best_u8);
508  color2[y1_color] = ClipToByte(m1 * best_u8 + c1 + 0.5);
509  color2[y2_color] = ClipToByte(m2 * best_u8 + c2 + 0.5);
510  color2[L_ALPHA_CHANNEL] = ClipToByte(rms);
511  } else {
512  // There is only one color.
513  color1[COLOR_RED] = ClipToByte(red_stats.median());
514  color1[COLOR_GREEN] = ClipToByte(green_stats.median());
515  color1[COLOR_BLUE] = ClipToByte(blue_stats.median());
516  color1[L_ALPHA_CHANNEL] = 0;
517  memcpy(color2, color1, 4);
518  }
519  if (color_map1 != nullptr) {
520  pixSetInRectArbitrary(color_map1, scaled_box,
521  ComposeRGB(color1[COLOR_RED],
522  color1[COLOR_GREEN],
523  color1[COLOR_BLUE]));
524  pixSetInRectArbitrary(color_map2, scaled_box,
525  ComposeRGB(color2[COLOR_RED],
526  color2[COLOR_GREEN],
527  color2[COLOR_BLUE]));
528  pixSetInRectArbitrary(rms_map, scaled_box, color1[L_ALPHA_CHANNEL]);
529  }
530  pixDestroy(&scaled);
531  boxDestroy(&scaled_box);
532 }
533 
534 // ================ CUTTING POLYGONAL IMAGES FROM A RECTANGLE ================
535 // The following functions are responsible for cutting a polygonal image from
536 // a rectangle: CountPixelsInRotatedBox, AttemptToShrinkBox, CutChunkFromParts
537 // with DivideImageIntoParts as the master.
538 // Problem statement:
539 // We start with a single connected component from the image mask: we get
540 // a Pix of the component, and its location on the page (im_box).
541 // The objective of cutting a polygonal image from its rectangle is to avoid
542 // interfering text, but not text that completely overlaps the image.
543 // ------------------------------ ------------------------------
544 // | Single input partition | | 1 Cut up output partitions |
545 // | | ------------------------------
546 // Av|oid | Avoid | |
547 // | | |________________________|
548 // Int|erfering | Interfering | |
549 // | | _____|__________________|
550 // T|ext | Text | |
551 // | Text-on-image | | Text-on-image |
552 // ------------------------------ --------------------------
553 // DivideImageIntoParts does this by building a ColPartition_LIST (not in the
554 // grid) with each ColPartition representing one of the rectangles needed,
555 // starting with a single rectangle for the whole image component, and cutting
556 // bits out of it with CutChunkFromParts as needed to avoid text. The output
557 // ColPartitions are supposed to be ordered from top to bottom.
558 
559 // The problem is complicated by the fact that we have rotated the coordinate
560 // system to make text lines horizontal, so if we need to look at the component
561 // image, we have to rotate the coordinates. Throughout the functions in this
562 // section im_box is the rectangle representing the image component in the
563 // rotated page coordinates (where we are building our output ColPartitions),
564 // rotation is the rotation that we used to get there, and rerotation is the
565 // rotation required to get back to original page image coordinates.
566 // To get to coordinates in the component image, pix, we rotate the im_box,
567 // the point we want to locate, and subtract the rotated point from the top-left
568 // of the rotated im_box.
569 // im_box is therefore essential to calculating coordinates within the pix.
570 
571 // Returns true if there are no black pixels in between the boxes.
572 // The im_box must represent the bounding box of the pix in tesseract
573 // coordinates, which may be negative, due to rotations to make the textlines
574 // horizontal. The boxes are rotated by rotation, which should undo such
575 // rotations, before mapping them onto the pix.
576 bool ImageFind::BlankImageInBetween(const TBOX& box1, const TBOX& box2,
577  const TBOX& im_box, const FCOORD& rotation,
578  Pix* pix) {
579  TBOX search_box(box1);
580  search_box += box2;
581  if (box1.x_gap(box2) >= box1.y_gap(box2)) {
582  if (box1.x_gap(box2) <= 0)
583  return true;
584  search_box.set_left(std::min(box1.right(), box2.right()));
585  search_box.set_right(std::max(box1.left(), box2.left()));
586  } else {
587  if (box1.y_gap(box2) <= 0)
588  return true;
589  search_box.set_top(std::max(box1.bottom(), box2.bottom()));
590  search_box.set_bottom(std::min(box1.top(), box2.top()));
591  }
592  return CountPixelsInRotatedBox(search_box, im_box, rotation, pix) == 0;
593 }
594 
595 // Returns the number of pixels in box in the pix.
596 // rotation, pix and im_box are defined in the large comment above.
598  const FCOORD& rotation, Pix* pix) {
599  // Intersect it with the image box.
600  box &= im_box; // This is in-place box intersection.
601  if (box.null_box())
602  return 0;
603  box.rotate(rotation);
604  TBOX rotated_im_box(im_box);
605  rotated_im_box.rotate(rotation);
606  Pix* rect_pix = pixCreate(box.width(), box.height(), 1);
607  pixRasterop(rect_pix, 0, 0, box.width(), box.height(),
608  PIX_SRC, pix, box.left() - rotated_im_box.left(),
609  rotated_im_box.top() - box.top());
610  l_int32 result;
611  pixCountPixels(rect_pix, &result, nullptr);
612  pixDestroy(&rect_pix);
613  return result;
614 }
615 
616 // The box given by slice contains some black pixels, but not necessarily
617 // over the whole box. Shrink the x bounds of slice, but not the y bounds
618 // until there is at least one black pixel in the outermost columns.
619 // rotation, rerotation, pix and im_box are defined in the large comment above.
620 static void AttemptToShrinkBox(const FCOORD& rotation, const FCOORD& rerotation,
621  const TBOX& im_box, Pix* pix, TBOX* slice) {
622  TBOX rotated_box(*slice);
623  rotated_box.rotate(rerotation);
624  TBOX rotated_im_box(im_box);
625  rotated_im_box.rotate(rerotation);
626  int left = rotated_box.left() - rotated_im_box.left();
627  int right = rotated_box.right() - rotated_im_box.left();
628  int top = rotated_im_box.top() - rotated_box.top();
629  int bottom = rotated_im_box.top() - rotated_box.bottom();
630  ImageFind::BoundsWithinRect(pix, &left, &top, &right, &bottom);
631  top = rotated_im_box.top() - top;
632  bottom = rotated_im_box.top() - bottom;
633  left += rotated_im_box.left();
634  right += rotated_im_box.left();
635  rotated_box.set_to_given_coords(left, bottom, right, top);
636  rotated_box.rotate(rotation);
637  slice->set_left(rotated_box.left());
638  slice->set_right(rotated_box.right());
639 }
640 
641 // The meat of cutting a polygonal image around text.
642 // This function covers the general case of cutting a box out of a box
643 // as shown:
644 // Input Output
645 // ------------------------------ ------------------------------
646 // | Single input partition | | 1 Cut up output partitions |
647 // | | ------------------------------
648 // | ---------- | --------- ----------
649 // | | box | | | 2 | box | 3 |
650 // | | | | | | is cut | |
651 // | ---------- | --------- out ----------
652 // | | ------------------------------
653 // | | | 4 |
654 // ------------------------------ ------------------------------
655 // In the context that this function is used, at most 3 of the above output
656 // boxes will be created, as the overlapping box is never contained by the
657 // input.
658 // The above cutting operation is executed for each element of part_list that
659 // is overlapped by the input box. Each modified ColPartition is replaced
660 // in place in the list by the output of the cutting operation in the order
661 // shown above, so iff no holes are ever created, the output will be in
662 // top-to-bottom order, but in extreme cases, hole creation is possible.
663 // In such cases, the output order may cause strange block polygons.
664 // rotation, rerotation, pix and im_box are defined in the large comment above.
665 static void CutChunkFromParts(const TBOX& box, const TBOX& im_box,
666  const FCOORD& rotation, const FCOORD& rerotation,
667  Pix* pix, ColPartition_LIST* part_list) {
668  ASSERT_HOST(!part_list->empty());
669  ColPartition_IT part_it(part_list);
670  do {
671  ColPartition* part = part_it.data();
672  TBOX part_box = part->bounding_box();
673  if (part_box.overlap(box)) {
674  // This part must be cut and replaced with the remains. There are
675  // up to 4 pieces to be made. Start with the first one and use
676  // add_before_stay_put. For each piece if it has no black pixels
677  // left, just don't make the box.
678  // Above box.
679  if (box.top() < part_box.top()) {
680  TBOX slice(part_box);
681  slice.set_bottom(box.top());
682  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation,
683  pix) > 0) {
684  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
685  part_it.add_before_stay_put(
687  BTFT_NONTEXT));
688  }
689  }
690  // Left of box.
691  if (box.left() > part_box.left()) {
692  TBOX slice(part_box);
693  slice.set_right(box.left());
694  if (box.top() < part_box.top())
695  slice.set_top(box.top());
696  if (box.bottom() > part_box.bottom())
697  slice.set_bottom(box.bottom());
698  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation,
699  pix) > 0) {
700  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
701  part_it.add_before_stay_put(
703  BTFT_NONTEXT));
704  }
705  }
706  // Right of box.
707  if (box.right() < part_box.right()) {
708  TBOX slice(part_box);
709  slice.set_left(box.right());
710  if (box.top() < part_box.top())
711  slice.set_top(box.top());
712  if (box.bottom() > part_box.bottom())
713  slice.set_bottom(box.bottom());
714  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation,
715  pix) > 0) {
716  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
717  part_it.add_before_stay_put(
719  BTFT_NONTEXT));
720  }
721  }
722  // Below box.
723  if (box.bottom() > part_box.bottom()) {
724  TBOX slice(part_box);
725  slice.set_top(box.bottom());
726  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation,
727  pix) > 0) {
728  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
729  part_it.add_before_stay_put(
731  BTFT_NONTEXT));
732  }
733  }
734  part->DeleteBoxes();
735  delete part_it.extract();
736  }
737  part_it.forward();
738  } while (!part_it.at_first());
739 }
740 
741 // Starts with the bounding box of the image component and cuts it up
742 // so that it doesn't intersect text where possible.
743 // Strong fully contained horizontal text is marked as text on image,
744 // and does not cause a division of the image.
745 // For more detail see the large comment above on cutting polygonal images
746 // from a rectangle.
747 // rotation, rerotation, pix and im_box are defined in the large comment above.
748 static void DivideImageIntoParts(const TBOX& im_box, const FCOORD& rotation,
749  const FCOORD& rerotation, Pix* pix,
750  ColPartitionGridSearch* rectsearch,
751  ColPartition_LIST* part_list) {
752  // Add the full im_box partition to the list to begin with.
753  ColPartition* pix_part = ColPartition::FakePartition(im_box, PT_UNKNOWN,
755  BTFT_NONTEXT);
756  ColPartition_IT part_it(part_list);
757  part_it.add_after_then_move(pix_part);
758 
759  rectsearch->StartRectSearch(im_box);
760  ColPartition* part;
761  while ((part = rectsearch->NextRectSearch()) != nullptr) {
762  TBOX part_box = part->bounding_box();
763  if (part_box.contains(im_box) && part->flow() >= BTFT_CHAIN) {
764  // This image is completely covered by an existing text partition.
765  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
766  ColPartition* pix_part = part_it.extract();
767  pix_part->DeleteBoxes();
768  delete pix_part;
769  }
770  } else if (part->flow() == BTFT_STRONG_CHAIN) {
771  // Text intersects the box.
772  TBOX overlap_box = part_box.intersection(im_box);
773  // Intersect it with the image box.
774  int black_area = ImageFind::CountPixelsInRotatedBox(overlap_box, im_box,
775  rerotation, pix);
776  if (black_area * 2 < part_box.area() || !im_box.contains(part_box)) {
777  // Eat a piece out of the image.
778  // Pad it so that pieces eaten out look decent.
779  int padding = part->blob_type() == BRT_VERT_TEXT
780  ? part_box.width() : part_box.height();
781  part_box.set_top(part_box.top() + padding / 2);
782  part_box.set_bottom(part_box.bottom() - padding / 2);
783  CutChunkFromParts(part_box, im_box, rotation, rerotation,
784  pix, part_list);
785  } else {
786  // Strong overlap with the black area, so call it text on image.
787  part->set_flow(BTFT_TEXT_ON_IMAGE);
788  }
789  }
790  if (part_list->empty()) {
791  break;
792  }
793  }
794 }
795 
796 // Search for the rightmost text that overlaps vertically and is to the left
797 // of the given box, but within the given left limit.
798 static int ExpandImageLeft(const TBOX& box, int left_limit,
799  ColPartitionGrid* part_grid) {
800  ColPartitionGridSearch search(part_grid);
801  ColPartition* part;
802  // Search right to left for any text that overlaps.
803  search.StartSideSearch(box.left(), box.bottom(), box.top());
804  while ((part = search.NextSideSearch(true)) != nullptr) {
805  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
806  const TBOX& part_box(part->bounding_box());
807  if (part_box.y_gap(box) < 0) {
808  if (part_box.right() > left_limit && part_box.right() < box.left())
809  left_limit = part_box.right();
810  break;
811  }
812  }
813  }
814  if (part != nullptr) {
815  // Search for the nearest text up to the one we already found.
816  TBOX search_box(left_limit, box.bottom(), box.left(), box.top());
817  search.StartRectSearch(search_box);
818  while ((part = search.NextRectSearch()) != nullptr) {
819  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
820  const TBOX& part_box(part->bounding_box());
821  if (part_box.y_gap(box) < 0) {
822  if (part_box.right() > left_limit && part_box.right() < box.left()) {
823  left_limit = part_box.right();
824  }
825  }
826  }
827  }
828  }
829  return left_limit;
830 }
831 
832 // Search for the leftmost text that overlaps vertically and is to the right
833 // of the given box, but within the given right limit.
834 static int ExpandImageRight(const TBOX& box, int right_limit,
835  ColPartitionGrid* part_grid) {
836  ColPartitionGridSearch search(part_grid);
837  ColPartition* part;
838  // Search left to right for any text that overlaps.
839  search.StartSideSearch(box.right(), box.bottom(), box.top());
840  while ((part = search.NextSideSearch(false)) != nullptr) {
841  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
842  const TBOX& part_box(part->bounding_box());
843  if (part_box.y_gap(box) < 0) {
844  if (part_box.left() < right_limit && part_box.left() > box.right())
845  right_limit = part_box.left();
846  break;
847  }
848  }
849  }
850  if (part != nullptr) {
851  // Search for the nearest text up to the one we already found.
852  TBOX search_box(box.left(), box.bottom(), right_limit, box.top());
853  search.StartRectSearch(search_box);
854  while ((part = search.NextRectSearch()) != nullptr) {
855  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
856  const TBOX& part_box(part->bounding_box());
857  if (part_box.y_gap(box) < 0) {
858  if (part_box.left() < right_limit && part_box.left() > box.right())
859  right_limit = part_box.left();
860  }
861  }
862  }
863  }
864  return right_limit;
865 }
866 
867 // Search for the topmost text that overlaps horizontally and is below
868 // the given box, but within the given bottom limit.
869 static int ExpandImageBottom(const TBOX& box, int bottom_limit,
870  ColPartitionGrid* part_grid) {
871  ColPartitionGridSearch search(part_grid);
872  ColPartition* part;
873  // Search right to left for any text that overlaps.
874  search.StartVerticalSearch(box.left(), box.right(), box.bottom());
875  while ((part = search.NextVerticalSearch(true)) != nullptr) {
876  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
877  const TBOX& part_box(part->bounding_box());
878  if (part_box.x_gap(box) < 0) {
879  if (part_box.top() > bottom_limit && part_box.top() < box.bottom())
880  bottom_limit = part_box.top();
881  break;
882  }
883  }
884  }
885  if (part != nullptr) {
886  // Search for the nearest text up to the one we already found.
887  TBOX search_box(box.left(), bottom_limit, box.right(), box.bottom());
888  search.StartRectSearch(search_box);
889  while ((part = search.NextRectSearch()) != nullptr) {
890  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
891  const TBOX& part_box(part->bounding_box());
892  if (part_box.x_gap(box) < 0) {
893  if (part_box.top() > bottom_limit && part_box.top() < box.bottom())
894  bottom_limit = part_box.top();
895  }
896  }
897  }
898  }
899  return bottom_limit;
900 }
901 
902 // Search for the bottommost text that overlaps horizontally and is above
903 // the given box, but within the given top limit.
904 static int ExpandImageTop(const TBOX& box, int top_limit,
905  ColPartitionGrid* part_grid) {
906  ColPartitionGridSearch search(part_grid);
907  ColPartition* part;
908  // Search right to left for any text that overlaps.
909  search.StartVerticalSearch(box.left(), box.right(), box.top());
910  while ((part = search.NextVerticalSearch(false)) != nullptr) {
911  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
912  const TBOX& part_box(part->bounding_box());
913  if (part_box.x_gap(box) < 0) {
914  if (part_box.bottom() < top_limit && part_box.bottom() > box.top())
915  top_limit = part_box.bottom();
916  break;
917  }
918  }
919  }
920  if (part != nullptr) {
921  // Search for the nearest text up to the one we already found.
922  TBOX search_box(box.left(), box.top(), box.right(), top_limit);
923  search.StartRectSearch(search_box);
924  while ((part = search.NextRectSearch()) != nullptr) {
925  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
926  const TBOX& part_box(part->bounding_box());
927  if (part_box.x_gap(box) < 0) {
928  if (part_box.bottom() < top_limit && part_box.bottom() > box.top())
929  top_limit = part_box.bottom();
930  }
931  }
932  }
933  }
934  return top_limit;
935 }
936 
937 // Expands the image box in the given direction until it hits text,
938 // limiting the expansion to the given limit box, returning the result
939 // in the expanded box, and
940 // returning the increase in area resulting from the expansion.
941 static int ExpandImageDir(BlobNeighbourDir dir, const TBOX& im_box,
942  const TBOX& limit_box,
943  ColPartitionGrid* part_grid, TBOX* expanded_box) {
944  *expanded_box = im_box;
945  switch (dir) {
946  case BND_LEFT:
947  expanded_box->set_left(ExpandImageLeft(im_box, limit_box.left(),
948  part_grid));
949  break;
950  case BND_RIGHT:
951  expanded_box->set_right(ExpandImageRight(im_box, limit_box.right(),
952  part_grid));
953  break;
954  case BND_ABOVE:
955  expanded_box->set_top(ExpandImageTop(im_box, limit_box.top(), part_grid));
956  break;
957  case BND_BELOW:
958  expanded_box->set_bottom(ExpandImageBottom(im_box, limit_box.bottom(),
959  part_grid));
960  break;
961  default:
962  return 0;
963  }
964  return expanded_box->area() - im_box.area();
965 }
966 
967 // Expands the image partition into any non-text until it touches text.
968 // The expansion proceeds in the order of increasing increase in area
969 // as a heuristic to find the best rectangle by expanding in the most
970 // constrained direction first.
971 static void MaximalImageBoundingBox(ColPartitionGrid* part_grid, TBOX* im_box) {
972  bool dunnit[BND_COUNT];
973  memset(dunnit, 0, sizeof(dunnit));
974  TBOX limit_box(part_grid->bleft().x(), part_grid->bleft().y(),
975  part_grid->tright().x(), part_grid->tright().y());
976  TBOX text_box(*im_box);
977  for (int iteration = 0; iteration < BND_COUNT; ++iteration) {
978  // Find the direction with least area increase.
979  int best_delta = -1;
980  BlobNeighbourDir best_dir = BND_LEFT;
981  TBOX expanded_boxes[BND_COUNT];
982  for (int dir = 0; dir < BND_COUNT; ++dir) {
983  auto bnd = static_cast<BlobNeighbourDir>(dir);
984  if (!dunnit[bnd]) {
985  TBOX expanded_box;
986  int area_delta = ExpandImageDir(bnd, text_box, limit_box, part_grid,
987  &expanded_boxes[bnd]);
988  if (best_delta < 0 || area_delta < best_delta) {
989  best_delta = area_delta;
990  best_dir = bnd;
991  }
992  }
993  }
994  // Run the best and remember the direction.
995  dunnit[best_dir] = true;
996  text_box = expanded_boxes[best_dir];
997  }
998  *im_box = text_box;
999 }
1000 
1001 // Helper deletes the given partition but first marks up all the blobs as
1002 // noise, so they get deleted later, and disowns them.
1003 // If the initial type of the partition is image, then it actually deletes
1004 // the blobs, as the partition owns them in that case.
1005 static void DeletePartition(ColPartition* part) {
1006  BlobRegionType type = part->blob_type();
1007  if (type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {
1008  // The partition owns the boxes of these types, so just delete them.
1009  part->DeleteBoxes(); // From a previous iteration.
1010  } else {
1011  // Once marked, the blobs will be swept up by TidyBlobs.
1012  part->set_flow(BTFT_NONTEXT);
1013  part->set_blob_type(BRT_NOISE);
1014  part->SetBlobTypes();
1015  part->DisownBoxes(); // Created before FindImagePartitions.
1016  }
1017  delete part;
1018 }
1019 
1020 // The meat of joining fragmented images and consuming ColPartitions of
1021 // uncertain type.
1022 // *part_ptr is an input/output BRT_RECTIMAGE ColPartition that is to be
1023 // expanded to consume overlapping and nearby ColPartitions of uncertain type
1024 // and other BRT_RECTIMAGE partitions, but NOT to be expanded beyond
1025 // max_image_box. *part_ptr is NOT in the part_grid.
1026 // rectsearch is already constructed on the part_grid, and is used for
1027 // searching for overlapping and nearby ColPartitions.
1028 // ExpandImageIntoParts is called iteratively until it returns false. Each
1029 // time it absorbs the nearest non-contained candidate, and everything that
1030 // is fully contained within part_ptr's bounding box.
1031 // TODO(rays) what if it just eats everything inside max_image_box in one go?
1032 static bool ExpandImageIntoParts(const TBOX& max_image_box,
1033  ColPartitionGridSearch* rectsearch,
1034  ColPartitionGrid* part_grid,
1035  ColPartition** part_ptr) {
1036  ColPartition* image_part = *part_ptr;
1037  TBOX im_part_box = image_part->bounding_box();
1038  if (textord_tabfind_show_images > 1) {
1039  tprintf("Searching for merge with image part:");
1040  im_part_box.print();
1041  tprintf("Text box=");
1042  max_image_box.print();
1043  }
1044  rectsearch->StartRectSearch(max_image_box);
1045  ColPartition* part;
1046  ColPartition* best_part = nullptr;
1047  int best_dist = 0;
1048  while ((part = rectsearch->NextRectSearch()) != nullptr) {
1049  if (textord_tabfind_show_images > 1) {
1050  tprintf("Considering merge with part:");
1051  part->Print();
1052  if (im_part_box.contains(part->bounding_box()))
1053  tprintf("Fully contained\n");
1054  else if (!max_image_box.contains(part->bounding_box()))
1055  tprintf("Not within text box\n");
1056  else if (part->flow() == BTFT_STRONG_CHAIN)
1057  tprintf("Too strong text\n");
1058  else
1059  tprintf("Real candidate\n");
1060  }
1061  if (part->flow() == BTFT_STRONG_CHAIN ||
1062  part->flow() == BTFT_TEXT_ON_IMAGE ||
1063  part->blob_type() == BRT_POLYIMAGE)
1064  continue;
1065  TBOX box = part->bounding_box();
1066  if (max_image_box.contains(box) && part->blob_type() != BRT_NOISE) {
1067  if (im_part_box.contains(box)) {
1068  // Eat it completely.
1069  rectsearch->RemoveBBox();
1070  DeletePartition(part);
1071  continue;
1072  }
1073  int x_dist = std::max(0, box.x_gap(im_part_box));
1074  int y_dist = std::max(0, box.y_gap(im_part_box));
1075  int dist = x_dist * x_dist + y_dist * y_dist;
1076  if (dist > box.area() || dist > im_part_box.area())
1077  continue; // Not close enough.
1078  if (best_part == nullptr || dist < best_dist) {
1079  // We keep the nearest qualifier, which is not necessarily the nearest.
1080  best_part = part;
1081  best_dist = dist;
1082  }
1083  }
1084  }
1085  if (best_part != nullptr) {
1086  // It needs expanding. We can do it without touching text.
1087  TBOX box = best_part->bounding_box();
1088  if (textord_tabfind_show_images > 1) {
1089  tprintf("Merging image part:");
1090  im_part_box.print();
1091  tprintf("with part:");
1092  box.print();
1093  }
1094  im_part_box += box;
1095  *part_ptr = ColPartition::FakePartition(im_part_box, PT_UNKNOWN,
1096  BRT_RECTIMAGE,
1097  BTFT_NONTEXT);
1098  DeletePartition(image_part);
1099  part_grid->RemoveBBox(best_part);
1100  DeletePartition(best_part);
1101  rectsearch->RepositionIterator();
1102  return true;
1103  }
1104  return false;
1105 }
1106 
1107 // Helper function to compute the overlap area between the box and the
1108 // given list of partitions.
1109 static int IntersectArea(const TBOX& box, ColPartition_LIST* part_list) {
1110  int intersect_area = 0;
1111  ColPartition_IT part_it(part_list);
1112  // Iterate the parts and subtract intersecting area.
1113  for (part_it.mark_cycle_pt(); !part_it.cycled_list();
1114  part_it.forward()) {
1115  ColPartition* image_part = part_it.data();
1116  TBOX intersect = box.intersection(image_part->bounding_box());
1117  intersect_area += intersect.area();
1118  }
1119  return intersect_area;
1120 }
1121 
1122 // part_list is a set of ColPartitions representing a polygonal image, and
1123 // im_box is the union of the bounding boxes of all the parts in part_list.
1124 // Tests whether part is to be consumed by the polygonal image.
1125 // Returns true if part is weak text and more than half of its area is
1126 // intersected by parts from the part_list, and it is contained within im_box.
1127 static bool TestWeakIntersectedPart(const TBOX& im_box,
1128  ColPartition_LIST* part_list,
1129  ColPartition* part) {
1130  if (part->flow() < BTFT_STRONG_CHAIN) {
1131  // A weak partition intersects the box.
1132  const TBOX& part_box = part->bounding_box();
1133  if (im_box.contains(part_box)) {
1134  int area = part_box.area();
1135  int intersect_area = IntersectArea(part_box, part_list);
1136  if (area < 2 * intersect_area) {
1137  return true;
1138  }
1139  }
1140  }
1141  return false;
1142 }
1143 
1144 // A rectangular or polygonal image has been completed, in part_list, bounding
1145 // box in im_box. We want to eliminate weak text or other uncertain partitions
1146 // (basically anything that is not BRT_STRONG_CHAIN or better) from both the
1147 // part_grid and the big_parts list that are contained within im_box and
1148 // overlapped enough by the possibly polygonal image.
1149 static void EliminateWeakParts(const TBOX& im_box,
1150  ColPartitionGrid* part_grid,
1151  ColPartition_LIST* big_parts,
1152  ColPartition_LIST* part_list) {
1153  ColPartitionGridSearch rectsearch(part_grid);
1154  ColPartition* part;
1155  rectsearch.StartRectSearch(im_box);
1156  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1157  if (TestWeakIntersectedPart(im_box, part_list, part)) {
1158  BlobRegionType type = part->blob_type();
1159  if (type == BRT_POLYIMAGE || type == BRT_RECTIMAGE) {
1160  rectsearch.RemoveBBox();
1161  DeletePartition(part);
1162  } else {
1163  // The part is mostly covered, so mark it. Non-image partitions are
1164  // kept hanging around to mark the image for pass2
1165  part->set_flow(BTFT_NONTEXT);
1166  part->set_blob_type(BRT_NOISE);
1167  part->SetBlobTypes();
1168  }
1169  }
1170  }
1171  ColPartition_IT big_it(big_parts);
1172  for (big_it.mark_cycle_pt(); !big_it.cycled_list(); big_it.forward()) {
1173  part = big_it.data();
1174  if (TestWeakIntersectedPart(im_box, part_list, part)) {
1175  // Once marked, the blobs will be swept up by TidyBlobs.
1176  DeletePartition(big_it.extract());
1177  }
1178  }
1179 }
1180 
1181 // Helper scans for good text partitions overlapping the given box.
1182 // If there are no good text partitions overlapping an expanded box, then
1183 // the box is expanded, otherwise, the original box is returned.
1184 // If good text overlaps the box, true is returned.
1185 static bool ScanForOverlappingText(ColPartitionGrid* part_grid, TBOX* box) {
1186  ColPartitionGridSearch rectsearch(part_grid);
1187  TBOX padded_box(*box);
1188  padded_box.pad(kNoisePadding, kNoisePadding);
1189  rectsearch.StartRectSearch(padded_box);
1190  ColPartition* part;
1191  bool any_text_in_padded_rect = false;
1192  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1193  if (part->flow() == BTFT_CHAIN ||
1194  part->flow() == BTFT_STRONG_CHAIN) {
1195  // Text intersects the box.
1196  any_text_in_padded_rect = true;
1197  const TBOX& part_box = part->bounding_box();
1198  if (box->overlap(part_box)) {
1199  return true;
1200  }
1201  }
1202  }
1203  if (!any_text_in_padded_rect)
1204  *box = padded_box;
1205  return false;
1206 }
1207 
1208 // Renders the boxes of image parts from the supplied list onto the image_pix,
1209 // except where they interfere with existing strong text in the part_grid,
1210 // and then deletes them.
1211 // Box coordinates are rotated by rerotate to match the image.
1212 static void MarkAndDeleteImageParts(const FCOORD& rerotate,
1213  ColPartitionGrid* part_grid,
1214  ColPartition_LIST* image_parts,
1215  Pix* image_pix) {
1216  if (image_pix == nullptr)
1217  return;
1218  int imageheight = pixGetHeight(image_pix);
1219  ColPartition_IT part_it(image_parts);
1220  for (; !part_it.empty(); part_it.forward()) {
1221  ColPartition* part = part_it.extract();
1222  TBOX part_box = part->bounding_box();
1223  BlobRegionType type = part->blob_type();
1224  if (!ScanForOverlappingText(part_grid, &part_box) ||
1225  type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {
1226  // Mark the box on the image.
1227  // All coords need to be rotated to match the image.
1228  part_box.rotate(rerotate);
1229  int left = part_box.left();
1230  int top = part_box.top();
1231  pixRasterop(image_pix, left, imageheight - top,
1232  part_box.width(), part_box.height(), PIX_SET, nullptr, 0, 0);
1233  }
1234  DeletePartition(part);
1235  }
1236 }
1237 
1238 // Locates all the image partitions in the part_grid, that were found by a
1239 // previous call to FindImagePartitions, marks them in the image_mask,
1240 // removes them from the grid, and deletes them. This makes it possible to
1241 // call FindImagePartitions again to produce less broken-up and less
1242 // overlapping image partitions.
1243 // rerotation specifies how to rotate the partition coords to match
1244 // the image_mask, since this function is used after orientation correction.
1246  ColPartitionGrid* part_grid,
1247  Pix* image_mask) {
1248  // Extract the noise parts from the grid and put them on a temporary list.
1249  ColPartition_LIST parts_list;
1250  ColPartition_IT part_it(&parts_list);
1251  ColPartitionGridSearch gsearch(part_grid);
1252  gsearch.StartFullSearch();
1253  ColPartition* part;
1254  while ((part = gsearch.NextFullSearch()) != nullptr) {
1255  BlobRegionType type = part->blob_type();
1256  if (type == BRT_NOISE || type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {
1257  part_it.add_after_then_move(part);
1258  gsearch.RemoveBBox();
1259  }
1260  }
1261  // Render listed noise partitions to the image mask.
1262  MarkAndDeleteImageParts(rerotation, part_grid, &parts_list, image_mask);
1263 }
1264 
1265 // Removes and deletes all image partitions that are too small to be worth
1266 // keeping. We have to do this as a separate phase after creating the image
1267 // partitions as the small images are needed to join the larger ones together.
1268 static void DeleteSmallImages(ColPartitionGrid* part_grid) {
1269  if (part_grid != nullptr) return;
1270  ColPartitionGridSearch gsearch(part_grid);
1271  gsearch.StartFullSearch();
1272  ColPartition* part;
1273  while ((part = gsearch.NextFullSearch()) != nullptr) {
1274  // Only delete rectangular images, since if it became a poly image, it
1275  // is more evidence that it is somehow important.
1276  if (part->blob_type() == BRT_RECTIMAGE) {
1277  const TBOX& part_box = part->bounding_box();
1278  if (part_box.width() < kMinImageFindSize ||
1279  part_box.height() < kMinImageFindSize) {
1280  // It is too small to keep. Just make it disappear.
1281  gsearch.RemoveBBox();
1282  DeletePartition(part);
1283  }
1284  }
1285  }
1286 }
1287 
1288 // Runs a CC analysis on the image_pix mask image, and creates
1289 // image partitions from them, cutting out strong text, and merging with
1290 // nearby image regions such that they don't interfere with text.
1291 // Rotation and rerotation specify how to rotate image coords to match
1292 // the blob and partition coords and back again.
1293 // The input/output part_grid owns all the created partitions, and
1294 // the partitions own all the fake blobs that belong in the partitions.
1295 // Since the other blobs in the other partitions will be owned by the block,
1296 // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this
1297 // situation and collect the image blobs.
1298 void ImageFind::FindImagePartitions(Pix* image_pix, const FCOORD& rotation,
1299  const FCOORD& rerotation, TO_BLOCK* block,
1300  TabFind* tab_grid, DebugPixa* pixa_debug,
1301  ColPartitionGrid* part_grid,
1302  ColPartition_LIST* big_parts) {
1303  int imageheight = pixGetHeight(image_pix);
1304  Boxa* boxa;
1305  Pixa* pixa;
1306  ConnCompAndRectangularize(image_pix, pixa_debug, &boxa, &pixa);
1307  // Iterate the connected components in the image regions mask.
1308  int nboxes = 0;
1309  if (boxa != nullptr && pixa != nullptr) nboxes = boxaGetCount(boxa);
1310  for (int i = 0; i < nboxes; ++i) {
1311  l_int32 x, y, width, height;
1312  boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height);
1313  Pix* pix = pixaGetPix(pixa, i, L_CLONE);
1314  TBOX im_box(x, imageheight -y - height, x + width, imageheight - y);
1315  im_box.rotate(rotation); // Now matches all partitions and blobs.
1316  ColPartitionGridSearch rectsearch(part_grid);
1317  rectsearch.SetUniqueMode(true);
1318  ColPartition_LIST part_list;
1319  DivideImageIntoParts(im_box, rotation, rerotation, pix,
1320  &rectsearch, &part_list);
1321  if (textord_tabfind_show_images && pixa_debug != nullptr) {
1322  pixa_debug->AddPix(pix, "ImageComponent");
1323  tprintf("Component has %d parts\n", part_list.length());
1324  }
1325  pixDestroy(&pix);
1326  if (!part_list.empty()) {
1327  ColPartition_IT part_it(&part_list);
1328  if (part_list.singleton()) {
1329  // We didn't have to chop it into a polygon to fit around text, so
1330  // try expanding it to merge fragmented image parts, as long as it
1331  // doesn't touch strong text.
1332  ColPartition* part = part_it.extract();
1333  TBOX text_box(im_box);
1334  MaximalImageBoundingBox(part_grid, &text_box);
1335  while (ExpandImageIntoParts(text_box, &rectsearch, part_grid, &part));
1336  part_it.set_to_list(&part_list);
1337  part_it.add_after_then_move(part);
1338  im_box = part->bounding_box();
1339  }
1340  EliminateWeakParts(im_box, part_grid, big_parts, &part_list);
1341  // Iterate the part_list and put the parts into the grid.
1342  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
1343  ColPartition* image_part = part_it.extract();
1344  im_box = image_part->bounding_box();
1345  part_grid->InsertBBox(true, true, image_part);
1346  if (!part_it.at_last()) {
1347  ColPartition* neighbour = part_it.data_relative(1);
1348  image_part->AddPartner(false, neighbour);
1349  neighbour->AddPartner(true, image_part);
1350  }
1351  }
1352  }
1353  }
1354  boxaDestroy(&boxa);
1355  pixaDestroy(&pixa);
1356  DeleteSmallImages(part_grid);
1357  if (textord_tabfind_show_images) {
1358  ScrollView* images_win_ = part_grid->MakeWindow(1000, 400, "With Images");
1359  part_grid->DisplayBoxes(images_win_);
1360  }
1361 }
1362 
1363 
1364 } // namespace tesseract.
tesseract::ImageFind::BoundsWithinRect
static bool BoundsWithinRect(Pix *pix, int *x_start, int *y_start, int *x_end, int *y_end)
Definition: imagefind.cpp:332
ScrollView
Definition: scrollview.h:97
INT_VAR
#define INT_VAR(name, val, comment)
Definition: params.h:300
tesseract::ImageFind::CountPixelsInRotatedBox
static int CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:597
BND_RIGHT
Definition: blobbox.h:89
tesseract::kMinImageFindSize
const int kMinImageFindSize
Definition: imagefind.cpp:47
tesseract::kRGBRMSColors
const int kRGBRMSColors
Definition: colpartition.h:36
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
tesseract::kMinColorDifference
const int kMinColorDifference
Definition: imagefind.cpp:51
LLSQ::add
void add(double x, double y)
Definition: linlsq.cpp:45
BTFT_STRONG_CHAIN
Definition: blobbox.h:118
tesseract::kMaxRectangularGradient
const double kMaxRectangularGradient
Definition: imagefind.cpp:45
LLSQ
Definition: linlsq.h:27
tesseract::BBGrid::InsertBBox
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
tesseract::kNoisePadding
const int kNoisePadding
Definition: ccnontextdetect.cpp:50
tesseract::BBGrid::MakeWindow
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
BRT_NOISE
Definition: blobbox.h:72
tesseract::ImageFind::ConnCompAndRectangularize
static void ConnCompAndRectangularize(Pix *pix, DebugPixa *pixa_debug, Boxa **boxa, Pixa **pixa)
Definition: imagefind.cpp:154
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
BND_BELOW
Definition: blobbox.h:88
TBOX::overlap
bool overlap(const TBOX &box) const
Definition: rect.h:350
params.h
LLSQ::c
double c(double m) const
Definition: linlsq.cpp:110
TBOX::print
void print() const
Definition: rect.h:277
tesseract::GridSearch::StartFullSearch
void StartFullSearch()
Definition: bbgrid.h:665
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
TBOX::area
int32_t area() const
Definition: rect.h:121
TO_BLOCK
Definition: blobbox.h:691
BRT_VERT_TEXT
Definition: blobbox.h:78
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
tesseract::ColPartitionGridSearch
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:935
BRT_RECTIMAGE
Definition: blobbox.h:75
LLSQ::m
double m() const
Definition: linlsq.cpp:95
FCOORD
Definition: points.h:187
BND_ABOVE
Definition: blobbox.h:90
BTFT_CHAIN
Definition: blobbox.h:117
BRT_POLYIMAGE
Definition: blobbox.h:76
TBOX::rotate
void rotate(const FCOORD &vec)
Definition: rect.h:196
tesseract::ColPartition
Definition: colpartition.h:67
TBOX::height
int16_t height() const
Definition: rect.h:107
TBOX::y_gap
int y_gap(const TBOX &box) const
Definition: rect.h:232
BTFT_NONTEXT
Definition: blobbox.h:115
LLSQ::rms
double rms(double m, double c) const
Definition: linlsq.cpp:123
statistc.h
tesseract::DebugPixa::AddPix
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
TBOX::set_right
void set_right(int x)
Definition: rect.h:81
tesseract::kMaxRectangularFraction
const double kMaxRectangularFraction
Definition: imagefind.cpp:42
BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:119
BlobRegionType
BlobRegionType
Definition: blobbox.h:71
TBOX::null_box
bool null_box() const
Definition: rect.h:49
tesseract::ColPartition::blob_type
BlobRegionType blob_type() const
Definition: colpartition.h:148
TBOX::width
int16_t width() const
Definition: rect.h:114
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::ImageFind::FindImagePartitions
static void FindImagePartitions(Pix *image_pix, const FCOORD &rotation, const FCOORD &rerotation, TO_BLOCK *block, TabFind *tab_grid, DebugPixa *pixa_debug, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
Definition: imagefind.cpp:1298
linlsq.h
tesseract::kMinRectangularFraction
const double kMinRectangularFraction
Definition: imagefind.cpp:40
tesseract::GridSearch
Definition: bbgrid.h:48
tesseract
Definition: baseapi.h:65
STATS::median
double median() const
Definition: statistc.cpp:218
BND_LEFT
Definition: blobbox.h:87
tesseract::DebugPixa
Definition: debugpixa.h:10
tesseract::ColPartition::FakePartition
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
Definition: colpartition.cpp:95
PT_UNKNOWN
Definition: capi.h:108
STATS
Definition: statistc.h:30
tesseract::ImageFind::BlankImageInBetween
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:576
tesseract::ColPartition::AddPartner
void AddPartner(bool upper, ColPartition *partner)
Definition: colpartition.cpp:603
STATS::ile
double ile(double frac) const
Definition: statistc.cpp:156
tesseract::ImageFind::TransferImagePartsToImageMask
static void TransferImagePartsToImageMask(const FCOORD &rerotation, ColPartitionGrid *part_grid, Pix *image_mask)
Definition: imagefind.cpp:1245
tesseract::ColPartition::bounding_box
const TBOX & bounding_box() const
Definition: colpartition.h:109
tesseract::BBGrid::DisplayBoxes
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:613
tesseract::ImageFind::ComputeRectangleColors
static void ComputeRectangleColors(const TBOX &rect, Pix *pix, int factor, Pix *color_map1, Pix *color_map2, Pix *rms_map, uint8_t *color1, uint8_t *color2)
Definition: imagefind.cpp:414
imagefind.h
tesseract::TabFind
Definition: tabfind.h:52
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::kRMSFitScaling
const double kRMSFitScaling
Definition: imagefind.cpp:49
tesseract::ColPartitionGrid
Definition: colpartitiongrid.h:32
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
BND_COUNT
Definition: blobbox.h:91
tesseract::ImageFind::FindImages
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:62
TBOX::right
int16_t right() const
Definition: rect.h:78
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ImageFind::ClipToByte
static uint8_t ClipToByte(double pixel)
Definition: imagefind.cpp:396
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::GridSearch::SetUniqueMode
void SetUniqueMode(bool mode)
Definition: bbgrid.h:253
tesseract::ImageFind::ColorDistanceFromLine
static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, const uint8_t *point)
Definition: imagefind.cpp:355
BlobNeighbourDir
BlobNeighbourDir
Definition: blobbox.h:86
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::ImageFind::pixNearlyRectangular
static bool pixNearlyRectangular(Pix *pix, double min_fraction, double max_fraction, double max_skew_gradient, int *x_start, int *y_start, int *x_end, int *y_end)
Definition: imagefind.cpp:266
tesseract::GridSearch::RemoveBBox
void RemoveBBox()
Definition: bbgrid.h:866
search
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:202
colpartitiongrid.h
tesseract::ImageFind::ComposeRGB
static uint32_t ComposeRGB(uint32_t r, uint32_t g, uint32_t b)
Definition: imagefind.cpp:389
TBOX::set_left
void set_left(int x)
Definition: rect.h:74
TBOX::x_gap
int x_gap(const TBOX &box) const
Definition: rect.h:224
tesseract::GridSearch::NextFullSearch
BBC * NextFullSearch()
Definition: bbgrid.h:675
TBOX
Definition: rect.h:33