tesseract  4.0.0-1-g2a2b
tablerecog.cpp
Go to the documentation of this file.
1 // File: tablerecog.cpp
3 // Description: Helper class to help structure table areas. Given an bounding
4 // box from TableFinder, the TableRecognizer should give a
5 // StructuredTable (maybe a list in the future) of "good" tables
6 // in that area.
7 // Author: Nicholas Beato
8 // Created: Friday, Aug. 20, 2010
9 //
10 // (C) Copyright 2009, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "tablerecog.h"
28 
29 #include <algorithm>
30 
31 namespace tesseract {
32 
33 // The amount of space required between the ColPartitions in 2 columns
34 // of a non-lined table as a multiple of the median width.
35 const double kHorizontalSpacing = 0.30;
36 // The amount of space required between the ColPartitions in 2 rows
37 // of a non-lined table as multiples of the median height.
38 const double kVerticalSpacing = -0.2;
39 // The number of cells that the grid lines may intersect.
40 // See FindCellSplitLocations for explanation.
41 const int kCellSplitRowThreshold = 0;
43 // For "lined tables", the number of required lines. Currently a guess.
46 // Number of columns required, as a fraction of the most columns found.
47 // None of these are tweaked at all.
48 const double kRequiredColumns = 0.7;
49 // The tolerance for comparing margins of potential tables.
50 const double kMarginFactor = 1.1;
51 // The first and last row should be consistent cell height.
52 // This factor is the first and last row cell height max.
53 const double kMaxRowSize = 2.5;
54 // Number of filled columns required to form a strong table row.
55 // For small tables, this is an absolute number.
56 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
58  sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
59 // For large tables, it is a relative number
60 const double kGoodRowNumberOfColumnsLarge = 0.7;
61 // The amount of area that must be covered in a cell by ColPartitions to
62 // be considered "filled"
63 const double kMinFilledArea = 0.35;
64 
68 
70  : text_grid_(nullptr),
71  line_grid_(nullptr),
72  is_lined_(false),
73  space_above_(0),
74  space_below_(0),
75  space_left_(0),
76  space_right_(0),
77  median_cell_height_(0),
78  median_cell_width_(0),
79  max_text_height_(INT32_MAX) {
80 }
81 
83 }
84 
86  text_grid_ = text_grid;
87 }
89  line_grid_ = line_grid;
90 }
92  max_text_height_ = height;
93 }
95  return is_lined_;
96 }
98  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
99 }
101  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
102 }
104  return row_count() * column_count();
105 }
107  bounding_box_ = box;
108 }
110  return bounding_box_;
111 }
113  return median_cell_height_;
114 }
116  return median_cell_width_;
117 }
118 int StructuredTable::row_height(int row) const {
119  ASSERT_HOST(0 <= row && row < row_count());
120  return cell_y_[row + 1] - cell_y_[row];
121 }
122 int StructuredTable::column_width(int column) const {
123  ASSERT_HOST(0 <= column && column < column_count());
124  return cell_x_[column + 1] - cell_x_[column];
125 }
127  return space_above_;
128 }
130  return space_below_;
131 }
132 
133 // At this point, we know that the lines are contained
134 // by the box (by FindLinesBoundingBox).
135 // So try to find the cell structure and make sure it works out.
136 // The assumption is that all lines span the table. If this
137 // assumption fails, the VerifyLinedTable method will
138 // abort the lined table. The TableRecognizer will fall
139 // back on FindWhitespacedStructure.
141  ClearStructure();
142 
143  // Search for all of the lines in the current box.
144  // Update the cellular structure with the exact lines.
146  box_search.SetUniqueMode(true);
147  box_search.StartRectSearch(bounding_box_);
148  ColPartition* line = nullptr;
149 
150  while ((line = box_search.NextRectSearch()) != nullptr) {
151  if (line->IsHorizontalLine())
152  cell_y_.push_back(line->MidY());
153  if (line->IsVerticalLine())
154  cell_x_.push_back(line->MidX());
155  }
156 
157  // HasSignificantLines should guarantee cells.
158  // Because that code is a different class, just gracefully
159  // return false. This could be an assert.
160  if (cell_x_.length() < 3 || cell_y_.length() < 3)
161  return false;
162 
163  cell_x_.sort();
164  cell_y_.sort();
165 
166  // Remove duplicates that may have occurred due to split lines.
169 
170  // The border should be the extents of line boxes, not middle.
171  cell_x_[0] = bounding_box_.left();
175 
176  // Remove duplicates that may have occurred due to moving the borders.
179 
181  CalculateStats();
183  return is_lined_;
184 }
185 
186 // Finds the cellular structure given a particular box.
188  ClearStructure();
191 
192  if (!VerifyWhitespacedTable()) {
193  return false;
194  } else {
201  CalculateStats();
202  return true;
203  }
204 }
205 
206 // Tests if a partition fits inside the table structure.
207 // Partitions must fully span a grid line in order to intersect it.
208 // This means that a partition does not intersect a line
209 // that it "just" touches. This is mainly because the assumption
210 // throughout the code is that "0" distance is a very very small space.
212  const TBOX& box = part.bounding_box();
213  for (int i = 0; i < cell_x_.length(); ++i)
214  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
215  return false;
216  for (int i = 0; i < cell_y_.length(); ++i)
217  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
218  return false;
219  return true;
220 }
221 
222 // Checks if a sub-table has multiple data cells filled.
224  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
225 }
227  return CountFilledCells(row, row, 0, column_count() - 1);
228 }
230  return CountFilledCells(0, row_count() - 1, column, column);
231 }
232 int StructuredTable::CountFilledCells(int row_start, int row_end,
233  int column_start, int column_end) {
234  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
235  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
236  column_end < column_count());
237  int cell_count = 0;
238  TBOX cell_box;
239  for (int row = row_start; row <= row_end; ++row) {
240  cell_box.set_bottom(cell_y_[row]);
241  cell_box.set_top(cell_y_[row + 1]);
242  for (int col = column_start; col <= column_end; ++col) {
243  cell_box.set_left(cell_x_[col]);
244  cell_box.set_right(cell_x_[col + 1]);
245  if (CountPartitions(cell_box) > 0)
246  ++cell_count;
247  }
248  }
249  return cell_count;
250 }
251 
252 // Makes sure that at least one cell in a row has substantial area filled.
253 // This can filter out large whitespace caused by growing tables too far
254 // and page numbers.
256  for (int i = 0; i < column_count(); ++i) {
257  double area_filled = CalculateCellFilledPercentage(row, i);
258  if (area_filled >= kMinFilledArea)
259  return true;
260  }
261  return false;
262 }
263 
264 // Finds the filled area in a cell.
265 // Assume ColPartitions do not overlap for simplicity (even though they do).
267  ASSERT_HOST(0 <= row && row <= row_count());
268  ASSERT_HOST(0 <= column && column <= column_count());
269  const TBOX kCellBox(cell_x_[column], cell_y_[row],
270  cell_x_[column + 1], cell_y_[row + 1]);
271  ASSERT_HOST(!kCellBox.null_box());
272 
274  gsearch.SetUniqueMode(true);
275  gsearch.StartRectSearch(kCellBox);
276  double area_covered = 0;
277  ColPartition* text = nullptr;
278  while ((text = gsearch.NextRectSearch()) != nullptr) {
279  if (text->IsTextType())
280  area_covered += text->bounding_box().intersection(kCellBox).area();
281  }
282  const int32_t current_area = kCellBox.area();
283  if (current_area == 0) {
284  return 1.0;
285  }
286  return std::min(1.0, area_covered / current_area);
287 }
288 
290 #ifndef GRAPHICS_DISABLED
291  window->Brush(ScrollView::NONE);
292  window->Pen(color);
295  for (int i = 0; i < cell_x_.length(); i++) {
296  window->Line(cell_x_[i], bounding_box_.bottom(),
297  cell_x_[i], bounding_box_.top());
298  }
299  for (int i = 0; i < cell_y_.length(); i++) {
300  window->Line(bounding_box_.left(), cell_y_[i],
301  bounding_box_.right(), cell_y_[i]);
302  }
303  window->UpdateWindow();
304 #endif
305 }
306 
307 // Clear structure information.
309  cell_x_.clear();
310  cell_y_.clear();
311  is_lined_ = false;
312  space_above_ = 0;
313  space_below_ = 0;
314  space_left_ = 0;
315  space_right_ = 0;
317  median_cell_width_ = 0;
318 }
319 
320 // When a table has lines, the lines should not intersect any partitions.
321 // The following function makes sure the previous assumption is met.
323  // Function only called when lines exist.
324  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
325  for (int i = 0; i < cell_y_.length(); ++i) {
327  return false;
328  }
329  for (int i = 0; i < cell_x_.length(); ++i) {
331  return false;
332  }
333  return true;
334 }
335 
336 // TODO(nbeato): Could be much better than this.
337 // Examples:
338 // - Caclulate the percentage of filled cells.
339 // - Calculate the average number of ColPartitions per cell.
340 // - Calculate the number of cells per row with partitions.
341 // - Check if ColPartitions in adjacent cells are similar.
342 // - Check that all columns are at least a certain width.
343 // - etc.
345  // criteria for a table, must be at least 2x3 or 3x2
346  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
347 }
348 
349 // Finds vertical splits in the ColPartitions of text_grid_ by considering
350 // all possible "good" guesses. A good guess is just the left/right sides of
351 // the partitions, since these locations will uniquely define where the
352 // extremal values where the splits can occur. The split happens
353 // in the middle of the two nearest partitions.
355  // Set of the extents of all partitions on the page.
356  GenericVectorEqEq<int> left_sides;
357  GenericVectorEqEq<int> right_sides;
358 
359  // Look at each text partition. We want to find the partitions
360  // that have extremal left/right sides. These will give us a basis
361  // for the table columns.
363  gsearch.SetUniqueMode(true);
365  ColPartition* text = nullptr;
366  while ((text = gsearch.NextRectSearch()) != nullptr) {
367  if (!text->IsTextType())
368  continue;
369 
370  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
371  int spacing = static_cast<int>(text->median_width() *
372  kHorizontalSpacing / 2.0 + 0.5);
373  left_sides.push_back(text->bounding_box().left() - spacing);
374  right_sides.push_back(text->bounding_box().right() + spacing);
375  }
376  // It causes disaster below, so avoid it!
377  if (left_sides.length() == 0 || right_sides.length() == 0)
378  return;
379 
380  // Since data may be inserted in grid order, we sort the left/right sides.
381  left_sides.sort();
382  right_sides.sort();
383 
384  // At this point, in the "merged list", we expect to have a left side,
385  // followed by either more left sides or a right side. The last number
386  // should be a right side. We find places where the splits occur by looking
387  // for "valleys". If we want to force gap sizes or allow overlap, change
388  // the spacing above. If you want to let lines "slice" partitions as long
389  // as it is infrequent, change the following function.
390  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
391  &cell_x_);
392 }
393 
394 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
395 // all possible "good" guesses. A good guess is just the bottom/top sides of
396 // the partitions, since these locations will uniquely define where the
397 // extremal values where the splits can occur. The split happens
398 // in the middle of the two nearest partitions.
400  // Set of the extents of all partitions on the page.
401  GenericVectorEqEq<int> bottom_sides;
402  GenericVectorEqEq<int> top_sides;
403  // We will be "shrinking" partitions, so keep the min/max around to
404  // make sure the bottom/top lines do not intersect text.
405  int min_bottom = INT32_MAX;
406  int max_top = INT32_MIN;
407 
408  // Look at each text partition. We want to find the partitions
409  // that have extremal bottom/top sides. These will give us a basis
410  // for the table rows. Because the textlines can be skewed and close due
411  // to warping, the height of the partitions is toned down a little bit.
413  gsearch.SetUniqueMode(true);
415  ColPartition* text = nullptr;
416  while ((text = gsearch.NextRectSearch()) != nullptr) {
417  if (!text->IsTextType())
418  continue;
419 
420  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
421  min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
422  max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
423 
424  // Ignore "tall" text partitions, as these are usually false positive
425  // vertical text or multiple lines pulled together.
426  if (text->bounding_box().height() > max_text_height_)
427  continue;
428 
429  int spacing = static_cast<int>(text->bounding_box().height() *
430  kVerticalSpacing / 2.0 + 0.5);
431  int bottom = text->bounding_box().bottom() - spacing;
432  int top = text->bounding_box().top() + spacing;
433  // For horizontal text, the factor can be negative. This should
434  // probably cause a warning or failure. I haven't actually checked if
435  // it happens.
436  if (bottom >= top)
437  continue;
438 
439  bottom_sides.push_back(bottom);
440  top_sides.push_back(top);
441  }
442  // It causes disaster below, so avoid it!
443  if (bottom_sides.length() == 0 || top_sides.length() == 0)
444  return;
445 
446  // Since data may be inserted in grid order, we sort the bottom/top sides.
447  bottom_sides.sort();
448  top_sides.sort();
449 
450  // At this point, in the "merged list", we expect to have a bottom side,
451  // followed by either more bottom sides or a top side. The last number
452  // should be a top side. We find places where the splits occur by looking
453  // for "valleys". If we want to force gap sizes or allow overlap, change
454  // the spacing above. If you want to let lines "slice" partitions as long
455  // as it is infrequent, change the following function.
456  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
457  &cell_y_);
458 
459  // Recover the min/max correctly since it was shifted.
460  cell_y_[0] = min_bottom;
461  cell_y_[cell_y_.length() - 1] = max_top;
462 }
463 
465  space_above_ = INT32_MAX;
466  space_below_ = INT32_MAX;
467  space_right_ = INT32_MAX;
468  space_left_ = INT32_MAX;
471 }
472 // Finds the nearest partition in grid to the table
473 // boundaries and updates the margin.
475  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
476  space_below_ = std::min(space_below_, below);
477  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
478  space_above_ = std::min(space_above_, above);
479  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
480  space_left_ = std::min(space_left_, left);
481  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
482  space_right_ = std::min(space_right_, right);
483 }
485  bool decrease) const {
486  ColPartitionGridSearch gsearch(grid);
487  gsearch.SetUniqueMode(true);
489  border);
490  ColPartition* part = nullptr;
491  while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
492  if (!part->IsTextType() && !part->IsHorizontalLine())
493  continue;
494  int distance = decrease ? border - part->bounding_box().top()
495  : part->bounding_box().bottom() - border;
496  if (distance >= 0)
497  return distance;
498  }
499  return INT32_MAX;
500 }
502  bool decrease) const {
503  ColPartitionGridSearch gsearch(grid);
504  gsearch.SetUniqueMode(true);
505  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
506  ColPartition* part = nullptr;
507  while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
508  if (!part->IsTextType() && !part->IsVerticalLine())
509  continue;
510  int distance = decrease ? border - part->bounding_box().right()
511  : part->bounding_box().left() - border;
512  if (distance >= 0)
513  return distance;
514  }
515  return INT32_MAX;
516 }
517 
519  const int kMaxCellHeight = 1000;
520  const int kMaxCellWidth = 1000;
521  STATS height_stats(0, kMaxCellHeight + 1);
522  STATS width_stats(0, kMaxCellWidth + 1);
523 
524  for (int i = 0; i < row_count(); ++i)
525  height_stats.add(row_height(i), column_count());
526  for (int i = 0; i < column_count(); ++i)
527  width_stats.add(column_width(i), row_count());
528 
529  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
530  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
531 }
532 
533 // Looks for grid lines near the current bounding box and
534 // grows the bounding box to include them if no intersections
535 // will occur as a result. This is necessary because the margins
536 // are calculated relative to the closest line/text. If the
537 // line isn't absorbed, the margin will be the distance to the line.
540  gsearch.SetUniqueMode(true);
541 
542  // Is the closest line above good? Loop multiple times for tables with
543  // multi-line (sometimes 2) borders. Limit the number of lines by
544  // making sure they stay within a table cell or so.
545  ColPartition* line = nullptr;
547  bounding_box_.top());
548  while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
549  if (!line->IsHorizontalLine())
550  break;
551  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
552  bounding_box_.right(), line->MidY());
553  if (text_search.height() > median_cell_height_ * 2)
554  break;
555  if (CountPartitions(text_search) > 0)
556  break;
557  bounding_box_.set_top(line->MidY());
558  }
559  // As above, is the closest line below good?
560  line = nullptr;
563  while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
564  if (!line->IsHorizontalLine())
565  break;
566  TBOX text_search(bounding_box_.left(), line->MidY(),
568  if (text_search.height() > median_cell_height_ * 2)
569  break;
570  if (CountPartitions(text_search) > 0)
571  break;
572  bounding_box_.set_bottom(line->MidY());
573  }
574  // TODO(nbeato): vertical lines
575 }
576 
577 
578 // This function will find all "0 valleys" (of any length) given two
579 // arrays. The arrays are the mins and maxes of partitions (either
580 // left and right or bottom and top). Since the min/max lists are generated
581 // with pairs of increasing integers, we can make some assumptions in
582 // the function about ordering of the overall list, which are shown in the
583 // asserts.
584 // The algorithm works as follows:
585 // While there are numbers to process, take the smallest number.
586 // If it is from the min_list, increment the "hill" counter.
587 // Otherwise, decrement the "hill" counter.
588 // In the process of doing this, keep track of "crossing" the
589 // desired height.
590 // The first/last items are extremal values of the list and known.
591 // NOTE: This function assumes the lists are sorted!
593  const GenericVector<int>& max_list,
594  int max_merged,
595  GenericVector<int>* locations) {
596  locations->clear();
597  ASSERT_HOST(min_list.length() == max_list.length());
598  if (min_list.length() == 0)
599  return;
600  ASSERT_HOST(min_list.get(0) < max_list.get(0));
601  ASSERT_HOST(min_list.get(min_list.length() - 1) <
602  max_list.get(max_list.length() - 1));
603 
604  locations->push_back(min_list.get(0));
605  int min_index = 0;
606  int max_index = 0;
607  int stacked_partitions = 0;
608  int last_cross_position = INT32_MAX;
609  // max_index will expire after min_index.
610  // However, we can't "increase" the hill size if min_index expired.
611  // So finish processing when min_index expires.
612  while (min_index < min_list.length()) {
613  // Increase the hill count.
614  if (min_list[min_index] < max_list[max_index]) {
615  ++stacked_partitions;
616  if (last_cross_position != INT32_MAX &&
617  stacked_partitions > max_merged) {
618  int mid = (last_cross_position + min_list[min_index]) / 2;
619  locations->push_back(mid);
620  last_cross_position = INT32_MAX;
621  }
622  ++min_index;
623  } else {
624  // Decrease the hill count.
625  --stacked_partitions;
626  if (last_cross_position == INT32_MAX &&
627  stacked_partitions <= max_merged) {
628  last_cross_position = max_list[max_index];
629  }
630  ++max_index;
631  }
632  }
633  locations->push_back(max_list.get(max_list.length() - 1));
634 }
635 
636 // Counts the number of partitions in the table
637 // box that intersection the given x value.
639  int count = 0;
640  // Make a small box to keep the search time down.
641  const int kGridSize = text_grid_->gridsize();
642  TBOX vertical_box = bounding_box_;
643  vertical_box.set_left(x - kGridSize);
644  vertical_box.set_right(x + kGridSize);
645 
647  gsearch.SetUniqueMode(true);
648  gsearch.StartRectSearch(vertical_box);
649  ColPartition* text = nullptr;
650  while ((text = gsearch.NextRectSearch()) != nullptr) {
651  if (!text->IsTextType())
652  continue;
653  const TBOX& box = text->bounding_box();
654  if (box.left() < x && x < box.right())
655  ++count;
656  }
657  return count;
658 }
659 
660 // Counts the number of partitions in the table
661 // box that intersection the given y value.
663  int count = 0;
664  // Make a small box to keep the search time down.
665  const int kGridSize = text_grid_->gridsize();
666  TBOX horizontal_box = bounding_box_;
667  horizontal_box.set_bottom(y - kGridSize);
668  horizontal_box.set_top(y + kGridSize);
669 
671  gsearch.SetUniqueMode(true);
672  gsearch.StartRectSearch(horizontal_box);
673  ColPartition* text = nullptr;
674  while ((text = gsearch.NextRectSearch()) != nullptr) {
675  if (!text->IsTextType())
676  continue;
677 
678  const TBOX& box = text->bounding_box();
679  if (box.bottom() < y && y < box.top())
680  ++count;
681  }
682  return count;
683 }
684 
685 // Counts how many text partitions are in this box.
686 // This is used to count partitons in cells, as that can indicate
687 // how "strong" a potential table row/column (or even full table) actually is.
690  gsearch.SetUniqueMode(true);
691  gsearch.StartRectSearch(box);
692  int count = 0;
693  ColPartition* text = nullptr;
694  while ((text = gsearch.NextRectSearch()) != nullptr) {
695  if (text->IsTextType())
696  ++count;
697  }
698  return count;
699 }
700 
704 
706  : text_grid_(nullptr),
707  line_grid_(nullptr),
708  min_height_(0),
709  min_width_(0),
710  max_text_height_(INT32_MAX) {
711 }
712 
714 }
715 
717 }
718 
720  text_grid_ = text_grid;
721 }
723  line_grid_ = line_grid;
724 }
726  min_height_ = height;
727 }
729  min_width_ = width;
730 }
732  max_text_height_ = height;
733 }
734 
736  StructuredTable* table = new StructuredTable();
737  table->Init();
738  table->set_text_grid(text_grid_);
739  table->set_line_grid(line_grid_);
741 
742  // Try to solve this simple case, a table with *both*
743  // vertical and horizontal lines.
744  if (RecognizeLinedTable(guess, table))
745  return table;
746 
747  // Fallback to whitespace if that failed.
748  // TODO(nbeato): Break this apart to take advantage of horizontal
749  // lines or vertical lines when present.
750  if (RecognizeWhitespacedTable(guess, table))
751  return table;
752 
753  // No table found...
754  delete table;
755  return nullptr;
756 }
757 
759  StructuredTable* table) {
760  if (!HasSignificantLines(guess_box))
761  return false;
762  TBOX line_bound = guess_box;
763  if (!FindLinesBoundingBox(&line_bound))
764  return false;
765  table->set_bounding_box(line_bound);
766  return table->FindLinedStructure();
767 }
768 
769 // Quick implementation. Just count the number of lines in the box.
770 // A better implementation would counter intersections and look for connected
771 // components. It could even go as far as finding similar length lines.
772 // To account for these possible issues, the VerifyLinedTableCells function
773 // will reject lined tables that cause intersections with text on the page.
774 // TODO(nbeato): look for "better" lines
777  box_search.SetUniqueMode(true);
778  box_search.StartRectSearch(guess);
779  ColPartition* line = nullptr;
780  int vertical_count = 0;
781  int horizontal_count = 0;
782 
783  while ((line = box_search.NextRectSearch()) != nullptr) {
784  if (line->IsHorizontalLine())
785  ++horizontal_count;
786  if (line->IsVerticalLine())
787  ++vertical_count;
788  }
789 
790  return vertical_count >= kLinedTableMinVerticalLines &&
791  horizontal_count >= kLinedTableMinHorizontalLines;
792 }
793 
794 // Given a bounding box with a bunch of horizontal / vertical lines,
795 // we just find the extents of all of these lines iteratively.
796 // The box will be at least as large as guess. This
797 // could possibly be a bad assumption.
798 // It is guaranteed to halt in at least O(n * gridarea) where n
799 // is the number of lines.
800 // The assumption is that growing the box iteratively will add lines
801 // several times, but eventually we'll find the extents.
802 //
803 // For tables, the approach is a bit aggressive, a single line (which could be
804 // noise or a column ruling) can destroy the table inside.
805 //
806 // TODO(nbeato): This is a quick first implementation.
807 // A better implementation would actually look for consistency
808 // in extents of the lines and find the extents using lines
809 // that clearly describe the table. This would allow the
810 // lines to "vote" for height/width. An approach like
811 // this would solve issues with page layout rulings.
812 // I haven't looked for these issues yet, so I can't even
813 // say they happen confidently.
815  // The first iteration will tell us if there are lines
816  // present and shrink the box to a minimal iterative size.
817  if (!FindLinesBoundingBoxIteration(bounding_box))
818  return false;
819 
820  // Keep growing until the area of the table stabilizes.
821  // The box can only get bigger, increasing area.
822  bool changed = true;
823  while (changed) {
824  changed = false;
825  int old_area = bounding_box->area();
826  bool check = FindLinesBoundingBoxIteration(bounding_box);
827  // At this point, the function will return true.
828  ASSERT_HOST(check);
829  ASSERT_HOST(bounding_box->area() >= old_area);
830  changed = (bounding_box->area() > old_area);
831  }
832 
833  return true;
834 }
835 
837  // Search for all of the lines in the current box, keeping track of extents.
839  box_search.SetUniqueMode(true);
840  box_search.StartRectSearch(*bounding_box);
841  ColPartition* line = nullptr;
842  bool first_line = true;
843 
844  while ((line = box_search.NextRectSearch()) != nullptr) {
845  if (line->IsLineType()) {
846  if (first_line) {
847  // The first iteration can shrink the box.
848  *bounding_box = line->bounding_box();
849  first_line = false;
850  } else {
851  *bounding_box += line->bounding_box();
852  }
853  }
854  }
855  return !first_line;
856 }
857 
858 // The goal of this function is to move the table boundaries around and find
859 // a table that maximizes the whitespace around the table while maximizing
860 // the cellular structure. As a result, it gets confused by headers, footers,
861 // and merged columns (text that crosses columns). There is a tolerance
862 // that allows a few partitions to count towards potential cell merges.
863 // It's the max_merged parameter to FindPartitionLocations.
864 // It can work, but it needs some false positive remove on boundaries.
865 // For now, the grid structure must not intersect any partitions.
866 // Also, small tolerance is added to the horizontal lines for tightly packed
867 // tables. The tolerance is added by adjusting the bounding boxes of the
868 // partitions (in FindHorizontalPartitions). The current implementation
869 // only adjusts the vertical extents of the table.
870 //
871 // Also note. This was hacked at a lot. It could probably use some
872 // more hacking at to find a good set of border conditions and then a
873 // nice clean up.
875  StructuredTable* table) {
876  TBOX best_box = guess_box; // Best borders known.
877  int best_below = 0; // Margin size above best table.
878  int best_above = 0; // Margin size below best table.
879  TBOX adjusted = guess_box; // The search box.
880 
881  // We assume that the guess box is somewhat accurate, so we don't allow
882  // the adjusted border to pass half of the guessed area. This prevents
883  // "negative" tables from forming.
884  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
885  // Keeps track of the most columns in an accepted table. The resulting table
886  // may be less than the max, but we don't want to stray too far.
887  int best_cols = 0;
888  // Make sure we find a good border.
889  bool found_good_border = false;
890 
891  // Find the bottom of the table by trying a few different locations. For
892  // each location, the top, left, and right are fixed. We start the search
893  // in a smaller table to favor best_cols getting a good estimate sooner.
894  int last_bottom = INT32_MAX;
895  int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
896  kMidGuessY - min_height_ / 2, true);
897  int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
898  kMidGuessY + min_height_ / 2, false);
899  adjusted.set_top(top);
900 
901  // Headers/footers can be spaced far from everything.
902  // Make sure that the space below is greater than the space above
903  // the lowest row.
904  int previous_below = 0;
905  const int kMaxChances = 10;
906  int chances = kMaxChances;
907  while (bottom != last_bottom) {
908  adjusted.set_bottom(bottom);
909 
910  if (adjusted.height() >= min_height_) {
911  // Try to fit the grid on the current box. We give it a chance
912  // if the number of columns didn't significantly drop.
913  table->set_bounding_box(adjusted);
914  if (table->FindWhitespacedStructure() &&
915  table->column_count() >= best_cols * kRequiredColumns) {
916  if (false && IsWeakTableRow(table, 0)) {
917  // Currently buggy, but was looking promising so disabled.
918  --chances;
919  } else {
920  // We favor 2 things,
921  // 1- Adding rows that have partitioned data.
922  // 2- Better margins (to find header/footer).
923  // For better tables, we just look for multiple cells in the
924  // bottom row with data in them.
925  // For margins, the space below the last row should
926  // be better than a table with the last row removed.
927  chances = kMaxChances;
928  double max_row_height = kMaxRowSize * table->median_cell_height();
929  if ((table->space_below() * kMarginFactor >= best_below &&
930  table->space_below() >= previous_below) ||
931  (table->CountFilledCellsInRow(0) > 1 &&
932  table->row_height(0) < max_row_height)) {
933  best_box.set_bottom(bottom);
934  best_below = table->space_below();
935  best_cols = std::max(table->column_count(), best_cols);
936  found_good_border = true;
937  }
938  }
939  previous_below = table->space_below();
940  } else {
941  --chances;
942  }
943  }
944  if (chances <= 0)
945  break;
946 
947  last_bottom = bottom;
948  bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
949  last_bottom, true);
950  }
951  if (!found_good_border)
952  return false;
953 
954  // TODO(nbeato) comments: follow modified code above... put it in a function!
955  found_good_border = false;
956  int last_top = INT32_MIN;
957  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
958  kMidGuessY + min_height_ / 2, false);
959  int previous_above = 0;
960  chances = kMaxChances;
961 
962  adjusted.set_bottom(best_box.bottom());
963  while (last_top != top) {
964  adjusted.set_top(top);
965  if (adjusted.height() >= min_height_) {
966  table->set_bounding_box(adjusted);
967  if (table->FindWhitespacedStructure() &&
968  table->column_count() >= best_cols * kRequiredColumns) {
969  int last_row = table->row_count() - 1;
970  if (false && IsWeakTableRow(table, last_row)) {
971  // Currently buggy, but was looking promising so disabled.
972  --chances;
973  } else {
974  chances = kMaxChances;
975  double max_row_height = kMaxRowSize * table->median_cell_height();
976  if ((table->space_above() * kMarginFactor >= best_above &&
977  table->space_above() >= previous_above) ||
978  (table->CountFilledCellsInRow(last_row) > 1 &&
979  table->row_height(last_row) < max_row_height)) {
980  best_box.set_top(top);
981  best_above = table->space_above();
982  best_cols = std::max(table->column_count(), best_cols);
983  found_good_border = true;
984  }
985  }
986  previous_above = table->space_above();
987  } else {
988  --chances;
989  }
990  }
991  if (chances <= 0)
992  break;
993 
994  last_top = top;
995  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
996  last_top, false);
997  }
998 
999  if (!found_good_border)
1000  return false;
1001 
1002  // If we get here, this shouldn't happen. It can be an assert, but
1003  // I haven't tested it enough to make it crash things.
1004  if (best_box.null_box())
1005  return false;
1006 
1007  // Given the best locations, fit the box to those locations.
1008  table->set_bounding_box(best_box);
1009  return table->FindWhitespacedStructure();
1010 }
1011 
1012 // Finds the closest value to y that can safely cause a horizontal
1013 // split in the partitions.
1014 // This function has been buggy and not as reliable as I would've
1015 // liked. I suggest finding all of the splits using the
1016 // FindPartitionLocations once and then just keeping the results
1017 // of that function cached somewhere.
1018 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
1019  bool top_to_bottom) {
1021  gsearch.SetUniqueMode(true);
1022  gsearch.StartVerticalSearch(left, right, y);
1023  ColPartition* text = nullptr;
1024  int last_y = y;
1025  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1026  if (!text->IsTextType() || !text->IsHorizontalType())
1027  continue;
1028  if (text->bounding_box().height() > max_text_height_)
1029  continue;
1030 
1031  const TBOX& text_box = text->bounding_box();
1032  if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1033  last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1034  continue;
1035  }
1036  if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1037  last_y = std::max(last_y, static_cast<int>(text_box.top()));
1038  continue;
1039  }
1040 
1041  return last_y;
1042  }
1043  // If none is found, we at least want to preserve the min/max,
1044  // which defines the overlap of y with the last partition in the grid.
1045  return last_y;
1046 }
1047 
1048 // Code is buggy right now. It is disabled in the calling function.
1049 // It seems like sometimes the row that is passed in is not correct
1050 // sometimes (like a phantom row is introduced). There's something going
1051 // on in the cell_y_ data member before this is called... not certain.
1053  if (!table->VerifyRowFilled(row))
1054  return false;
1055 
1056  double threshold = 0.0;
1058  threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
1059  else
1060  threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
1061 
1062  return table->CountFilledCellsInRow(row) < threshold;
1063 }
1064 
1065 } // namespace tesseract
bool IsHorizontalType() const
Definition: colpartition.h:446
int gridsize() const
Definition: bbgrid.h:64
ColPartitionGrid * text_grid_
Definition: tablerecog.h:367
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:44
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:748
const double kMinFilledArea
Definition: tablerecog.cpp:63
void set_top(int y)
Definition: rect.h:61
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool null_box() const
Definition: rect.h:50
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:266
void set_bottom(int y)
Definition: rect.h:68
static bool IsWeakTableRow(StructuredTable *table, int row)
int count(LIST var_list)
Definition: oldlist.cpp:98
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:662
Definition: rect.h:34
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:836
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int CountFilledCellsInRow(int row)
Definition: tablerecog.cpp:226
Definition: statistc.h:33
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:501
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:735
void set_max_text_height(int height)
Definition: tablerecog.cpp:731
const TBOX & bounding_box() const
Definition: tablerecog.cpp:109
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:722
void set_bounding_box(const TBOX &box)
Definition: tablerecog.cpp:106
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:832
void set_right(int x)
Definition: rect.h:82
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:484
const double kHorizontalSpacing
Definition: tablerecog.cpp:35
const double kMaxRowSize
Definition: tablerecog.cpp:53
void compact_sorted()
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:289
int16_t left() const
Definition: rect.h:72
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:775
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:85
int16_t top() const
Definition: rect.h:58
T & get(int index) const
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:763
double median() const
Definition: statistc.cpp:238
void set_max_text_height(int height)
Definition: tablerecog.cpp:91
BBC * NextRectSearch()
Definition: bbgrid.h:844
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:592
const double kGoodRowNumberOfColumnsSmall[]
Definition: tablerecog.cpp:56
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:42
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:474
int length() const
Definition: genericvector.h:85
bool VerifyRowFilled(int row)
Definition: tablerecog.cpp:255
void set_min_height(int height)
Definition: tablerecog.cpp:725
ColPartitionGrid * line_grid_
Definition: tablerecog.h:368
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:790
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:874
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:88
int32_t area() const
Definition: rect.h:122
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:45
const int kGoodRowNumberOfColumnsSmallSize
Definition: tablerecog.cpp:57
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
int push_back(T object)
void set_left(int x)
Definition: rect.h:75
const TBOX & bounding_box() const
Definition: colpartition.h:110
const double kVerticalSpacing
Definition: tablerecog.cpp:38
bool IsVerticalLine() const
Definition: colpartition.h:455
void UpdateWindow()
Definition: scrollview.cpp:706
const double kRequiredColumns
Definition: tablerecog.cpp:48
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int CountFilledCellsInColumn(int column)
Definition: tablerecog.cpp:229
int16_t right() const
Definition: rect.h:79
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:41
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:814
void set_min_width(int width)
Definition: tablerecog.cpp:728
int row_height(int row) const
Definition: tablerecog.cpp:118
const double kGoodRowNumberOfColumnsLarge
Definition: tablerecog.cpp:60
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:804
bool IsHorizontalLine() const
Definition: colpartition.h:460
int column_width(int column) const
Definition: tablerecog.cpp:122
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:638
void SetUniqueMode(bool mode)
Definition: bbgrid.h:255
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
const double kMarginFactor
Definition: tablerecog.cpp:50
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:758
int16_t height() const
Definition: rect.h:108
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:534
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:719
bool DoesPartitionFit(const ColPartition &part) const
Definition: tablerecog.cpp:211
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:688
#define ASSERT_HOST(x)
Definition: errcode.h:84
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
void Brush(Color color)
Definition: scrollview.cpp:728