tesseract  4.0.0-1-g2a2b
tesseract::StructuredTable Class Reference

#include <tablerecog.h>

Public Member Functions

 StructuredTable ()
 
 ~StructuredTable ()=default
 
void Init ()
 
void set_text_grid (ColPartitionGrid *text)
 
void set_line_grid (ColPartitionGrid *lines)
 
void set_max_text_height (int height)
 
bool is_lined () const
 
int row_count () const
 
int column_count () const
 
int cell_count () const
 
void set_bounding_box (const TBOX &box)
 
const TBOX & bounding_box () const
 
int median_cell_height ()
 
int median_cell_width ()
 
int row_height (int row) const
 
int column_width (int column) const
 
int space_above () const
 
int space_below () const
 
bool FindLinedStructure ()
 
bool FindWhitespacedStructure ()
 
bool DoesPartitionFit (const ColPartition &part) const
 
int CountFilledCells ()
 
int CountFilledCellsInRow (int row)
 
int CountFilledCellsInColumn (int column)
 
int CountFilledCells (int row_start, int row_end, int column_start, int column_end)
 
bool VerifyRowFilled (int row)
 
double CalculateCellFilledPercentage (int row, int column)
 
void Display (ScrollView *window, ScrollView::Color color)
 

Protected Member Functions

void ClearStructure ()
 
bool VerifyLinedTableCells ()
 
bool VerifyWhitespacedTable ()
 
void FindWhitespacedColumns ()
 
void FindWhitespacedRows ()
 
void CalculateMargins ()
 
void UpdateMargins (ColPartitionGrid *grid)
 
int FindVerticalMargin (ColPartitionGrid *grid, int start_x, bool decrease) const
 
int FindHorizontalMargin (ColPartitionGrid *grid, int start_y, bool decrease) const
 
void CalculateStats ()
 
void AbsorbNearbyLines ()
 
int CountVerticalIntersections (int x)
 
int CountHorizontalIntersections (int y)
 
int CountPartitions (const TBOX &box)
 

Static Protected Member Functions

static void FindCellSplitLocations (const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
 

Protected Attributes

ColPartitionGrid * text_grid_
 
ColPartitionGrid * line_grid_
 
TBOX bounding_box_
 
GenericVectorEqEq< int > cell_x_
 
GenericVectorEqEq< int > cell_y_
 
bool is_lined_
 
int space_above_
 
int space_below_
 
int space_left_
 
int space_right_
 
int median_cell_height_
 
int median_cell_width_
 
int max_text_height_
 

Detailed Description

Definition at line 72 of file tablerecog.h.

Constructor & Destructor Documentation

◆ StructuredTable()

tesseract::StructuredTable::StructuredTable ( )

Definition at line 69 of file tablerecog.cpp.

◆ ~StructuredTable()

tesseract::StructuredTable::~StructuredTable ( )
default

Member Function Documentation

◆ AbsorbNearbyLines()

void tesseract::StructuredTable::AbsorbNearbyLines ( )
protected

Definition at line 538 of file tablerecog.cpp.

538  {
539  ColPartitionGridSearch gsearch(line_grid_);
540  gsearch.SetUniqueMode(true);
541 
542  // Is the closest line above good? Loop multiple times for tables with
543  // multi-line (sometimes 2) borders. Limit the number of lines by
544  // making sure they stay within a table cell or so.
545  ColPartition* line = nullptr;
546  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
547  bounding_box_.top());
548  while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
549  if (!line->IsHorizontalLine())
550  break;
551  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
552  bounding_box_.right(), line->MidY());
553  if (text_search.height() > median_cell_height_ * 2)
554  break;
555  if (CountPartitions(text_search) > 0)
556  break;
557  bounding_box_.set_top(line->MidY());
558  }
559  // As above, is the closest line below good?
560  line = nullptr;
561  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
562  bounding_box_.bottom());
563  while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
564  if (!line->IsHorizontalLine())
565  break;
566  TBOX text_search(bounding_box_.left(), line->MidY(),
567  bounding_box_.right(), bounding_box_.bottom() - 1);
568  if (text_search.height() > median_cell_height_ * 2)
569  break;
570  if (CountPartitions(text_search) > 0)
571  break;
572  bounding_box_.set_bottom(line->MidY());
573  }
574  // TODO(nbeato): vertical lines
575 }
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:688
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238

◆ bounding_box()

const TBOX & tesseract::StructuredTable::bounding_box ( ) const

Definition at line 109 of file tablerecog.cpp.

109  {
110  return bounding_box_;
111 }

◆ CalculateCellFilledPercentage()

double tesseract::StructuredTable::CalculateCellFilledPercentage ( int  row,
int  column 
)

Definition at line 266 of file tablerecog.cpp.

266  {
267  ASSERT_HOST(0 <= row && row <= row_count());
268  ASSERT_HOST(0 <= column && column <= column_count());
269  const TBOX kCellBox(cell_x_[column], cell_y_[row],
270  cell_x_[column + 1], cell_y_[row + 1]);
271  ASSERT_HOST(!kCellBox.null_box());
272 
273  ColPartitionGridSearch gsearch(text_grid_);
274  gsearch.SetUniqueMode(true);
275  gsearch.StartRectSearch(kCellBox);
276  double area_covered = 0;
277  ColPartition* text = nullptr;
278  while ((text = gsearch.NextRectSearch()) != nullptr) {
279  if (text->IsTextType())
280  area_covered += text->bounding_box().intersection(kCellBox).area();
281  }
282  const int32_t current_area = kCellBox.area();
283  if (current_area == 0) {
284  return 1.0;
285  }
286  return std::min(1.0, area_covered / current_area);
287 }
Definition: rect.h:34
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CalculateMargins()

void tesseract::StructuredTable::CalculateMargins ( )
protected

Definition at line 464 of file tablerecog.cpp.

464  {
465  space_above_ = INT32_MAX;
466  space_below_ = INT32_MAX;
467  space_right_ = INT32_MAX;
468  space_left_ = INT32_MAX;
469  UpdateMargins(text_grid_);
470  UpdateMargins(line_grid_);
471 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:474
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238

◆ CalculateStats()

void tesseract::StructuredTable::CalculateStats ( )
protected

Definition at line 518 of file tablerecog.cpp.

518  {
519  const int kMaxCellHeight = 1000;
520  const int kMaxCellWidth = 1000;
521  STATS height_stats(0, kMaxCellHeight + 1);
522  STATS width_stats(0, kMaxCellWidth + 1);
523 
524  for (int i = 0; i < row_count(); ++i)
525  height_stats.add(row_height(i), column_count());
526  for (int i = 0; i < column_count(); ++i)
527  width_stats.add(column_width(i), row_count());
528 
529  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
530  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
531 }
Definition: statistc.h:33
int row_height(int row) const
Definition: tablerecog.cpp:118
int column_width(int column) const
Definition: tablerecog.cpp:122

◆ cell_count()

int tesseract::StructuredTable::cell_count ( ) const

Definition at line 103 of file tablerecog.cpp.

103  {
104  return row_count() * column_count();
105 }

◆ ClearStructure()

void tesseract::StructuredTable::ClearStructure ( )
protected

Definition at line 308 of file tablerecog.cpp.

308  {
309  cell_x_.clear();
310  cell_y_.clear();
311  is_lined_ = false;
312  space_above_ = 0;
313  space_below_ = 0;
314  space_left_ = 0;
315  space_right_ = 0;
316  median_cell_height_ = 0;
317  median_cell_width_ = 0;
318 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244

◆ column_count()

int tesseract::StructuredTable::column_count ( ) const

Definition at line 100 of file tablerecog.cpp.

100  {
101  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
102 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:85

◆ column_width()

int tesseract::StructuredTable::column_width ( int  column) const

Definition at line 122 of file tablerecog.cpp.

122  {
123  ASSERT_HOST(0 <= column && column < column_count());
124  return cell_x_[column + 1] - cell_x_[column];
125 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CountFilledCells() [1/2]

int tesseract::StructuredTable::CountFilledCells ( )

Definition at line 223 of file tablerecog.cpp.

223  {
224  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
225 }

◆ CountFilledCells() [2/2]

int tesseract::StructuredTable::CountFilledCells ( int  row_start,
int  row_end,
int  column_start,
int  column_end 
)

Definition at line 232 of file tablerecog.cpp.

233  {
234  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
235  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
236  column_end < column_count());
237  int cell_count = 0;
238  TBOX cell_box;
239  for (int row = row_start; row <= row_end; ++row) {
240  cell_box.set_bottom(cell_y_[row]);
241  cell_box.set_top(cell_y_[row + 1]);
242  for (int col = column_start; col <= column_end; ++col) {
243  cell_box.set_left(cell_x_[col]);
244  cell_box.set_right(cell_x_[col + 1]);
245  if (CountPartitions(cell_box) > 0)
246  ++cell_count;
247  }
248  }
249  return cell_count;
250 }
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
Definition: rect.h:34
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void set_right(int x)
Definition: rect.h:82
void set_left(int x)
Definition: rect.h:75
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:688
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CountFilledCellsInColumn()

int tesseract::StructuredTable::CountFilledCellsInColumn ( int  column)

Definition at line 229 of file tablerecog.cpp.

229  {
230  return CountFilledCells(0, row_count() - 1, column, column);
231 }

◆ CountFilledCellsInRow()

int tesseract::StructuredTable::CountFilledCellsInRow ( int  row)

Definition at line 226 of file tablerecog.cpp.

226  {
227  return CountFilledCells(row, row, 0, column_count() - 1);
228 }

◆ CountHorizontalIntersections()

int tesseract::StructuredTable::CountHorizontalIntersections ( int  y)
protected

Definition at line 662 of file tablerecog.cpp.

662  {
663  int count = 0;
664  // Make a small box to keep the search time down.
665  const int kGridSize = text_grid_->gridsize();
666  TBOX horizontal_box = bounding_box_;
667  horizontal_box.set_bottom(y - kGridSize);
668  horizontal_box.set_top(y + kGridSize);
669 
670  ColPartitionGridSearch gsearch(text_grid_);
671  gsearch.SetUniqueMode(true);
672  gsearch.StartRectSearch(horizontal_box);
673  ColPartition* text = nullptr;
674  while ((text = gsearch.NextRectSearch()) != nullptr) {
675  if (!text->IsTextType())
676  continue;
677 
678  const TBOX& box = text->bounding_box();
679  if (box.bottom() < y && y < box.top())
680  ++count;
681  }
682  return count;
683 }
int gridsize() const
Definition: bbgrid.h:64
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
int count(LIST var_list)
Definition: oldlist.cpp:98
Definition: rect.h:34
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65

◆ CountPartitions()

int tesseract::StructuredTable::CountPartitions ( const TBOX & box)
protected

Definition at line 688 of file tablerecog.cpp.

688  {
689  ColPartitionGridSearch gsearch(text_grid_);
690  gsearch.SetUniqueMode(true);
691  gsearch.StartRectSearch(box);
692  int count = 0;
693  ColPartition* text = nullptr;
694  while ((text = gsearch.NextRectSearch()) != nullptr) {
695  if (text->IsTextType())
696  ++count;
697  }
698  return count;
699 }
int count(LIST var_list)
Definition: oldlist.cpp:98
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936

◆ CountVerticalIntersections()

int tesseract::StructuredTable::CountVerticalIntersections ( int  x)
protected

Definition at line 638 of file tablerecog.cpp.

638  {
639  int count = 0;
640  // Make a small box to keep the search time down.
641  const int kGridSize = text_grid_->gridsize();
642  TBOX vertical_box = bounding_box_;
643  vertical_box.set_left(x - kGridSize);
644  vertical_box.set_right(x + kGridSize);
645 
646  ColPartitionGridSearch gsearch(text_grid_);
647  gsearch.SetUniqueMode(true);
648  gsearch.StartRectSearch(vertical_box);
649  ColPartition* text = nullptr;
650  while ((text = gsearch.NextRectSearch()) != nullptr) {
651  if (!text->IsTextType())
652  continue;
653  const TBOX& box = text->bounding_box();
654  if (box.left() < x && x < box.right())
655  ++count;
656  }
657  return count;
658 }
int gridsize() const
Definition: bbgrid.h:64
int count(LIST var_list)
Definition: oldlist.cpp:98
Definition: rect.h:34
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void set_right(int x)
Definition: rect.h:82
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
void set_left(int x)
Definition: rect.h:75
int16_t right() const
Definition: rect.h:79

◆ Display()

void tesseract::StructuredTable::Display ( ScrollView * window,
ScrollView::Color  color 
)

Definition at line 289 of file tablerecog.cpp.

289  {
290 #ifndef GRAPHICS_DISABLED
291  window->Brush(ScrollView::NONE);
292  window->Pen(color);
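293  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
294  bounding_box_.right(), bounding_box_.top());
295  for (int i = 0; i < cell_x_.length(); i++) {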
296  window->Line(cell_x_[i], bounding_box_.bottom(),
297  cell_x_[i], bounding_box_.top());
298  }
299  for (int i = 0; i < cell_y_.length(); i++) {
300  window->Line(bounding_box_.left(), cell_y_[i],
301  bounding_box_.right(), cell_y_[i]);
302  }
303  window->UpdateWindow();
304 #endif
305 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int length() const
Definition: genericvector.h:85
void UpdateWindow()
Definition: scrollview.cpp:706
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int16_t right() const
Definition: rect.h:79
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:534
void Brush(Color color)
Definition: scrollview.cpp:728

◆ DoesPartitionFit()

bool tesseract::StructuredTable::DoesPartitionFit ( const ColPartition & part) const

Definition at line 211 of file tablerecog.cpp.

211  {
212  const TBOX& box = part.bounding_box();
213  for (int i = 0; i < cell_x_.length(); ++i)
214  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
215  return false;
216  for (int i = 0; i < cell_y_.length(); ++i)
217  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
218  return false;
219  return true;
220 }
Definition: rect.h:34
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int length() const
Definition: genericvector.h:85
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ FindCellSplitLocations()

void tesseract::StructuredTable::FindCellSplitLocations ( const GenericVector< int > &  min_list,
const GenericVector< int > &  max_list,
int  max_merged,
GenericVector< int > *  locations 
)
static protected

Definition at line 592 of file tablerecog.cpp.

595  {
596  locations->clear();
597  ASSERT_HOST(min_list.length() == max_list.length());
598  if (min_list.length() == 0)
599  return;
600  ASSERT_HOST(min_list.get(0) < max_list.get(0));
601  ASSERT_HOST(min_list.get(min_list.length() - 1) <
602  max_list.get(max_list.length() - 1));
603 
604  locations->push_back(min_list.get(0));
605  int min_index = 0;
606  int max_index = 0;
607  int stacked_partitions = 0;
608  int last_cross_position = INT32_MAX;
609  // max_index will expire after min_index.
610  // However, we can't "increase" the hill size if min_index expired.
611  // So finish processing when min_index expires.
612  while (min_index < min_list.length()) {
613  // Increase the hill count.
614  if (min_list[min_index] < max_list[max_index]) {
615  ++stacked_partitions;
616  if (last_cross_position != INT32_MAX &&
617  stacked_partitions > max_merged) {
618  int mid = (last_cross_position + min_list[min_index]) / 2;
619  locations->push_back(mid);
620  last_cross_position = INT32_MAX;
621  }
622  ++min_index;
623  } else {
624  // Decrease the hill count.
625  --stacked_partitions;
626  if (last_cross_position == INT32_MAX &&
627  stacked_partitions <= max_merged) {
628  last_cross_position = max_list[max_index];
629  }
630  ++max_index;
631  }
632  }
633  locations->push_back(max_list.get(max_list.length() - 1));
634 }
T & get(int index) const
int length() const
Definition: genericvector.h:85
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ FindHorizontalMargin()

int tesseract::StructuredTable::FindHorizontalMargin ( ColPartitionGrid * grid,
int  start_y,
bool  decrease 
) const
protected

Definition at line 501 of file tablerecog.cpp.

502  {
503  ColPartitionGridSearch gsearch(grid);
504  gsearch.SetUniqueMode(true);
505  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
506  ColPartition* part = nullptr;
507  while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
508  if (!part->IsTextType() && !part->IsVerticalLine())
509  continue;
510  int distance = decrease ? border - part->bounding_box().right()
511  : part->bounding_box().left() - border;
512  if (distance >= 0)
513  return distance;
514  }
515  return INT32_MAX;
516 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65

◆ FindLinedStructure()

bool tesseract::StructuredTable::FindLinedStructure ( )

Definition at line 140 of file tablerecog.cpp.

140  {
141  ClearStructure();
142 
143  // Search for all of the lines in the current box.
144  // Update the cellular structure with the exact lines.
145  ColPartitionGridSearch box_search(line_grid_);
146  box_search.SetUniqueMode(true);
147  box_search.StartRectSearch(bounding_box_);
148  ColPartition* line = nullptr;
149 
150  while ((line = box_search.NextRectSearch()) != nullptr) {
151  if (line->IsHorizontalLine())
152  cell_y_.push_back(line->MidY());
153  if (line->IsVerticalLine())
154  cell_x_.push_back(line->MidX());
155  }
156 
157  // HasSignificantLines should guarantee cells.
158  // Because that code is a different class, just gracefully
159  // return false. This could be an assert.
160  if (cell_x_.length() < 3 || cell_y_.length() < 3)
161  return false;
162 
163  cell_x_.sort();
164  cell_y_.sort();
165 
166  // Remove duplicates that may have occurred due to split lines.
167  cell_x_.compact_sorted();
168  cell_y_.compact_sorted();
169 
170  // The border should be the extents of line boxes, not middle.
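171  cell_x_[0] = bounding_box_.left();
172  cell_x_[cell_x_.length() - 1] = bounding_box_.right();
173  cell_y_[0] = bounding_box_.bottom();
174  cell_y_[cell_y_.length() - 1] = bounding_box_.top();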
175 
176  // Remove duplicates that may have occurred due to moving the borders.
177  cell_x_.compact_sorted();
178  cell_y_.compact_sorted();
179 
180  CalculateMargins();
181  CalculateStats();
182  is_lined_ = VerifyLinedTableCells();
183  return is_lined_;
184 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void compact_sorted()
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int length() const
Definition: genericvector.h:85
int push_back(T object)
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238

◆ FindVerticalMargin()

int tesseract::StructuredTable::FindVerticalMargin ( ColPartitionGrid * grid,
int  start_x,
bool  decrease 
) const
protected

Definition at line 484 of file tablerecog.cpp.

485  {
486  ColPartitionGridSearch gsearch(grid);
487  gsearch.SetUniqueMode(true);
488  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
489  border);
490  ColPartition* part = nullptr;
491  while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
492  if (!part->IsTextType() && !part->IsHorizontalLine())
493  continue;
494  int distance = decrease ? border - part->bounding_box().top()
495  : part->bounding_box().bottom() - border;
496  if (distance >= 0)
497  return distance;
498  }
499  return INT32_MAX;
500 }
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t right() const
Definition: rect.h:79

◆ FindWhitespacedColumns()

void tesseract::StructuredTable::FindWhitespacedColumns ( )
protected

Definition at line 354 of file tablerecog.cpp.

354  {
355  // Set of the extents of all partitions on the page.
356  GenericVectorEqEq<int> left_sides;
357  GenericVectorEqEq<int> right_sides;
358 
359  // Look at each text partition. We want to find the partitions
360  // that have extremal left/right sides. These will give us a basis
361  // for the table columns.
362  ColPartitionGridSearch gsearch(text_grid_);
363  gsearch.SetUniqueMode(true);
364  gsearch.StartRectSearch(bounding_box_);
365  ColPartition* text = nullptr;
366  while ((text = gsearch.NextRectSearch()) != nullptr) {
367  if (!text->IsTextType())
368  continue;
369 
370  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
371  int spacing = static_cast<int>(text->median_width() *
372  kHorizontalSpacing / 2.0 + 0.5);
373  left_sides.push_back(text->bounding_box().left() - spacing);
374  right_sides.push_back(text->bounding_box().right() + spacing);
375  }
376  // It causes disaster below, so avoid it!
377  if (left_sides.length() == 0 || right_sides.length() == 0)
378  return;
379 
380  // Since data may be inserted in grid order, we sort the left/right sides.
381  left_sides.sort();
382  right_sides.sort();
383 
384  // At this point, in the "merged list", we expect to have a left side,
385  // followed by either more left sides or a right side. The last number
386  // should be a right side. We find places where the splits occur by looking
387  // for "valleys". If we want to force gap sizes or allow overlap, change
388  // the spacing above. If you want to let lines "slice" partitions as long
389  // as it is infrequent, change the following function.
390  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
391  &cell_x_);
392 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
const double kHorizontalSpacing
Definition: tablerecog.cpp:35
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:592
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:42
int length() const
Definition: genericvector.h:85
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ FindWhitespacedRows()

void tesseract::StructuredTable::FindWhitespacedRows ( )
protected

Definition at line 399 of file tablerecog.cpp.

399  {
400  // Set of the extents of all partitions on the page.
401  GenericVectorEqEq<int> bottom_sides;
402  GenericVectorEqEq<int> top_sides;
403  // We will be "shrinking" partitions, so keep the min/max around to
404  // make sure the bottom/top lines do not intersect text.
405  int min_bottom = INT32_MAX;
406  int max_top = INT32_MIN;
407 
408  // Look at each text partition. We want to find the partitions
409  // that have extremal bottom/top sides. These will give us a basis
410  // for the table rows. Because the textlines can be skewed and close due
411  // to warping, the height of the partitions is toned down a little bit.
412  ColPartitionGridSearch gsearch(text_grid_);
413  gsearch.SetUniqueMode(true);
414  gsearch.StartRectSearch(bounding_box_);
415  ColPartition* text = nullptr;
416  while ((text = gsearch.NextRectSearch()) != nullptr) {
417  if (!text->IsTextType())
418  continue;
419 
420  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
421  min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
422  max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
423 
424  // Ignore "tall" text partitions, as these are usually false positive
425  // vertical text or multiple lines pulled together.
426  if (text->bounding_box().height() > max_text_height_)
427  continue;
428 
429  int spacing = static_cast<int>(text->bounding_box().height() *
430  kVerticalSpacing / 2.0 + 0.5);
431  int bottom = text->bounding_box().bottom() - spacing;
432  int top = text->bounding_box().top() + spacing;
433  // For horizontal text, the factor can be negative. This should
434  // probably cause a warning or failure. I haven't actually checked if
435  // it happens.
436  if (bottom >= top)
437  continue;
438 
439  bottom_sides.push_back(bottom);
440  top_sides.push_back(top);
441  }
442  // It causes disaster below, so avoid it!
443  if (bottom_sides.length() == 0 || top_sides.length() == 0)
444  return;
445 
446  // Since data may be inserted in grid order, we sort the bottom/top sides.
447  bottom_sides.sort();
448  top_sides.sort();
449 
450  // At this point, in the "merged list", we expect to have a bottom side,
451  // followed by either more bottom sides or a top side. The last number
452  // should be a top side. We find places where the splits occur by looking
453  // for "valleys". If we want to force gap sizes or allow overlap, change
454  // the spacing above. If you want to let lines "slice" partitions as long
455  // as it is infrequent, change the following function.
456  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
457  &cell_y_);
458 
459  // Recover the min/max correctly since it was shifted.
460  cell_y_[0] = min_bottom;
461  cell_y_[cell_y_.length() - 1] = max_top;
462 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:592
int length() const
Definition: genericvector.h:85
int push_back(T object)
const double kVerticalSpacing
Definition: tablerecog.cpp:38
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:41
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ FindWhitespacedStructure()

bool tesseract::StructuredTable::FindWhitespacedStructure ( )

Definition at line 187 of file tablerecog.cpp.

187  {
188  ClearStructure();
189  FindWhitespacedColumns();
190  FindWhitespacedRows();
191 
192  if (!VerifyWhitespacedTable()) {
193  return false;
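194  } else {
195  bounding_box_.set_left(cell_x_[0]);
196  bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
197  bounding_box_.set_bottom(cell_y_[0]);
198  bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
199  AbsorbNearbyLines();
200  CalculateMargins();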
201  CalculateStats();
202  return true;
203  }
204 }
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void set_right(int x)
Definition: rect.h:82
int length() const
Definition: genericvector.h:85
void set_left(int x)
Definition: rect.h:75
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244

◆ Init()

void tesseract::StructuredTable::Init ( )

Definition at line 82 of file tablerecog.cpp.

82  {
83 }

◆ is_lined()

bool tesseract::StructuredTable::is_lined ( ) const

Definition at line 94 of file tablerecog.cpp.

94  {
95  return is_lined_;
96 }

◆ median_cell_height()

int tesseract::StructuredTable::median_cell_height ( )

Definition at line 112 of file tablerecog.cpp.

112  {
113  return median_cell_height_;
114 }

◆ median_cell_width()

int tesseract::StructuredTable::median_cell_width ( )

Definition at line 115 of file tablerecog.cpp.

115  {
116  return median_cell_width_;
117 }

◆ row_count()

int tesseract::StructuredTable::row_count ( ) const

Definition at line 97 of file tablerecog.cpp.

97  {
98  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
99 }
int length() const
Definition: genericvector.h:85
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244

◆ row_height()

int tesseract::StructuredTable::row_height ( int  row) const

Definition at line 118 of file tablerecog.cpp.

118  {
119  ASSERT_HOST(0 <= row && row < row_count());
120  return cell_y_[row + 1] - cell_y_[row];
121 }
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ set_bounding_box()

void tesseract::StructuredTable::set_bounding_box ( const TBOX & box)

Definition at line 106 of file tablerecog.cpp.

106  {
107  bounding_box_ = box;
108 }

◆ set_line_grid()

void tesseract::StructuredTable::set_line_grid ( ColPartitionGrid * lines)

Definition at line 88 of file tablerecog.cpp.

88  {
89  line_grid_ = line_grid;
90 }
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238

◆ set_max_text_height()

void tesseract::StructuredTable::set_max_text_height ( int  height)

Definition at line 91 of file tablerecog.cpp.

91  {
92  max_text_height_ = height;
93 }

◆ set_text_grid()

void tesseract::StructuredTable::set_text_grid ( ColPartitionGrid * text)

Definition at line 85 of file tablerecog.cpp.

85  {
86  text_grid_ = text_grid;
87 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237

◆ space_above()

int tesseract::StructuredTable::space_above ( ) const

Definition at line 126 of file tablerecog.cpp.

126  {
127  return space_above_;
128 }

◆ space_below()

int tesseract::StructuredTable::space_below ( ) const

Definition at line 129 of file tablerecog.cpp.

129  {
130  return space_below_;
131 }

◆ UpdateMargins()

void tesseract::StructuredTable::UpdateMargins ( ColPartitionGrid * grid)
protected

Definition at line 474 of file tablerecog.cpp.

474  {
475  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
476  space_below_ = std::min(space_below_, below);
477  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
478  space_above_ = std::min(space_above_, above);
479  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
480  space_left_ = std::min(space_left_, left);
481  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
482  space_right_ = std::min(space_right_, right);
483 }
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:501
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:484
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ VerifyLinedTableCells()

bool tesseract::StructuredTable::VerifyLinedTableCells ( )
protected

Definition at line 322 of file tablerecog.cpp.

322  {
323  // Function only called when lines exist.
324  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
325  for (int i = 0; i < cell_y_.length(); ++i) {
326  if (CountHorizontalIntersections(cell_y_[i]) > 0)
327  return false;
328  }
329  for (int i = 0; i < cell_x_.length(); ++i) {
330  if (CountVerticalIntersections(cell_x_[i]) > 0)
331  return false;
332  }
333  return true;
334 }
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:662
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:85
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:638
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ VerifyRowFilled()

bool tesseract::StructuredTable::VerifyRowFilled ( int  row)

Definition at line 255 of file tablerecog.cpp.

255  {
256  for (int i = 0; i < column_count(); ++i) {
257  double area_filled = CalculateCellFilledPercentage(row, i);
258  if (area_filled >= kMinFilledArea)
259  return true;
260  }
261  return false;
262 }
const double kMinFilledArea
Definition: tablerecog.cpp:63
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:266

◆ VerifyWhitespacedTable()

bool tesseract::StructuredTable::VerifyWhitespacedTable ( )
protected

Definition at line 344 of file tablerecog.cpp.

344  {
345  // criteria for a table, must be at least 2x3 or 3x2
346  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
347 }

Member Data Documentation

◆ bounding_box_

TBOX tesseract::StructuredTable::bounding_box_
protected

Definition at line 242 of file tablerecog.h.

◆ cell_x_

GenericVectorEqEq<int> tesseract::StructuredTable::cell_x_
protected

Definition at line 243 of file tablerecog.h.

◆ cell_y_

GenericVectorEqEq<int> tesseract::StructuredTable::cell_y_
protected

Definition at line 244 of file tablerecog.h.

◆ is_lined_

bool tesseract::StructuredTable::is_lined_
protected

Definition at line 245 of file tablerecog.h.

◆ line_grid_

ColPartitionGrid* tesseract::StructuredTable::line_grid_
protected

Definition at line 238 of file tablerecog.h.

◆ max_text_height_

int tesseract::StructuredTable::max_text_height_
protected

Definition at line 254 of file tablerecog.h.

◆ median_cell_height_

int tesseract::StructuredTable::median_cell_height_
protected

Definition at line 251 of file tablerecog.h.

◆ median_cell_width_

int tesseract::StructuredTable::median_cell_width_
protected

Definition at line 252 of file tablerecog.h.

◆ space_above_

int tesseract::StructuredTable::space_above_
protected

Definition at line 247 of file tablerecog.h.

◆ space_below_

int tesseract::StructuredTable::space_below_
protected

Definition at line 248 of file tablerecog.h.

◆ space_left_

int tesseract::StructuredTable::space_left_
protected

Definition at line 249 of file tablerecog.h.

◆ space_right_

int tesseract::StructuredTable::space_right_
protected

Definition at line 250 of file tablerecog.h.

◆ text_grid_

ColPartitionGrid* tesseract::StructuredTable::text_grid_
protected

Definition at line 237 of file tablerecog.h.


The documentation for this class was generated from the following files: