All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::StructuredTable Class Reference

#include <tablerecog.h>

Public Member Functions

 StructuredTable ()
 
 ~StructuredTable ()
 
void Init ()
 
void set_text_grid (ColPartitionGrid *text)
 
void set_line_grid (ColPartitionGrid *lines)
 
void set_max_text_height (int height)
 
bool is_lined () const
 
int row_count () const
 
int column_count () const
 
int cell_count () const
 
void set_bounding_box (const TBOX &box)
 
const TBOX & bounding_box () const
 
int median_cell_height ()
 
int median_cell_width ()
 
int row_height (int row) const
 
int column_width (int column) const
 
int space_above () const
 
int space_below () const
 
bool FindLinedStructure ()
 
bool FindWhitespacedStructure ()
 
bool DoesPartitionFit (const ColPartition &part) const
 
int CountFilledCells ()
 
int CountFilledCellsInRow (int row)
 
int CountFilledCellsInColumn (int column)
 
int CountFilledCells (int row_start, int row_end, int column_start, int column_end)
 
bool VerifyRowFilled (int row)
 
double CalculateCellFilledPercentage (int row, int column)
 
void Display (ScrollView *window, ScrollView::Color color)
 

Protected Member Functions

void ClearStructure ()
 
bool VerifyLinedTableCells ()
 
bool VerifyWhitespacedTable ()
 
void FindWhitespacedColumns ()
 
void FindWhitespacedRows ()
 
void CalculateMargins ()
 
void UpdateMargins (ColPartitionGrid *grid)
 
int FindVerticalMargin (ColPartitionGrid *grid, int start_x, bool decrease) const
 
int FindHorizontalMargin (ColPartitionGrid *grid, int start_y, bool decrease) const
 
void CalculateStats ()
 
void AbsorbNearbyLines ()
 
int CountVerticalIntersections (int x)
 
int CountHorizontalIntersections (int y)
 
int CountPartitions (const TBOX &box)
 

Static Protected Member Functions

static void FindCellSplitLocations (const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
 

Protected Attributes

ColPartitionGrid * text_grid_
 
ColPartitionGrid * line_grid_
 
TBOX bounding_box_
 
GenericVectorEqEq< int > cell_x_
 
GenericVectorEqEq< int > cell_y_
 
bool is_lined_
 
int space_above_
 
int space_below_
 
int space_left_
 
int space_right_
 
int median_cell_height_
 
int median_cell_width_
 
int max_text_height_
 

Detailed Description

Definition at line 72 of file tablerecog.h.

Constructor & Destructor Documentation

tesseract::StructuredTable::StructuredTable ( )

Definition at line 67 of file tablerecog.cpp.

tesseract::StructuredTable::~StructuredTable ( )

Definition at line 80 of file tablerecog.cpp.

80  {
81 }

Member Function Documentation

void tesseract::StructuredTable::AbsorbNearbyLines ( )
protected

Definition at line 539 of file tablerecog.cpp.

539  {
540  ColPartitionGridSearch gsearch(line_grid_);
541  gsearch.SetUniqueMode(true);
542 
543  // Is the closest line above good? Loop multiple times for tables with
544  // multi-line (sometimes 2) borders. Limit the number of lines by
545  // making sure they stay within a table cell or so.
546  ColPartition* line = NULL;
547  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
548  bounding_box_.top());
549  while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
550  if (!line->IsHorizontalLine())
551  break;
552  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
553  bounding_box_.right(), line->MidY());
554  if (text_search.height() > median_cell_height_ * 2)
555  break;
556  if (CountPartitions(text_search) > 0)
557  break;
558  bounding_box_.set_top(line->MidY());
559  }
560  // As above, is the closest line below good?
561  line = NULL;
562  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
563  bounding_box_.bottom());
564  while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
565  if (!line->IsHorizontalLine())
566  break;
567  TBOX text_search(bounding_box_.left(), line->MidY(),
568  bounding_box_.right(), bounding_box_.bottom() - 1);
569  if (text_search.height() > median_cell_height_ * 2)
570  break;
571  if (CountPartitions(text_search) > 0)
572  break;
573  bounding_box_.set_bottom(line->MidY());
574  }
575  // TODO(nbeato): vertical lines
576 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
void set_bottom(int y)
Definition: rect.h:64
inT16 left() const
Definition: rect.h:68
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:689
const TBOX & tesseract::StructuredTable::bounding_box ( ) const

Definition at line 110 of file tablerecog.cpp.

110  {
111  return bounding_box_;
112 }
double tesseract::StructuredTable::CalculateCellFilledPercentage ( int  row,
int  column 
)

Definition at line 267 of file tablerecog.cpp.

267  {
268  ASSERT_HOST(0 <= row && row <= row_count());
269  ASSERT_HOST(0 <= column && column <= column_count());
270  const TBOX kCellBox(cell_x_[column], cell_y_[row],
271  cell_x_[column + 1], cell_y_[row + 1]);
272  ASSERT_HOST(!kCellBox.null_box());
273 
274  ColPartitionGridSearch gsearch(text_grid_);
275  gsearch.SetUniqueMode(true);
276  gsearch.StartRectSearch(kCellBox);
277  double area_covered = 0;
278  ColPartition* text = NULL;
279  while ((text = gsearch.NextRectSearch()) != NULL) {
280  if (text->IsTextType())
281  area_covered += text->bounding_box().intersection(kCellBox).area();
282  }
283  const inT32 current_area = kCellBox.area();
284  if (current_area == 0) {
285  return 1.0;
286  }
287  return MIN(1.0, area_covered / current_area);
288 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define MIN(x, y)
Definition: ndminx.h:28
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: rect.h:30
#define NULL
Definition: host.h:144
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int inT32
Definition: host.h:102
void tesseract::StructuredTable::CalculateMargins ( )
protected

Definition at line 465 of file tablerecog.cpp.

465  {
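466  space_above_ = MAX_INT32;
467  space_below_ = MAX_INT32;
468  space_right_ = MAX_INT32;
469  space_left_ = MAX_INT32;
470  UpdateMargins(text_grid_);
471  UpdateMargins(line_grid_);
472 }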
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
#define MAX_INT32
Definition: host.h:120
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:475
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void tesseract::StructuredTable::CalculateStats ( )
protected

Definition at line 519 of file tablerecog.cpp.

519  {
520  const int kMaxCellHeight = 1000;
521  const int kMaxCellWidth = 1000;
522  STATS height_stats(0, kMaxCellHeight + 1);
523  STATS width_stats(0, kMaxCellWidth + 1);
524 
525  for (int i = 0; i < row_count(); ++i)
526  height_stats.add(row_height(i), column_count());
527  for (int i = 0; i < column_count(); ++i)
528  width_stats.add(column_width(i), row_count());
529 
530  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
531  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
532 }
Definition: statistc.h:33
int column_width(int column) const
Definition: tablerecog.cpp:123
int row_height(int row) const
Definition: tablerecog.cpp:119
int tesseract::StructuredTable::cell_count ( ) const

Definition at line 104 of file tablerecog.cpp.

104  {
105  return row_count() * column_count();
106 }
void tesseract::StructuredTable::ClearStructure ( )
protected

Definition at line 309 of file tablerecog.cpp.

309  {
310  cell_x_.clear();
311  cell_y_.clear();
312  is_lined_ = false;
313  space_above_ = 0;
314  space_below_ = 0;
315  space_left_ = 0;
316  space_right_ = 0;
317  median_cell_height_ = 0;
318  median_cell_width_ = 0;
319 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int tesseract::StructuredTable::column_count ( ) const

Definition at line 101 of file tablerecog.cpp.

101  {
102  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
103 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
int tesseract::StructuredTable::column_width ( int  column) const

Definition at line 123 of file tablerecog.cpp.

123  {
124  ASSERT_HOST(0 <= column && column < column_count());
125  return cell_x_[column + 1] - cell_x_[column];
126 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define ASSERT_HOST(x)
Definition: errcode.h:84
int tesseract::StructuredTable::CountFilledCells ( )

Definition at line 224 of file tablerecog.cpp.

224  {
225  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
226 }
int tesseract::StructuredTable::CountFilledCells ( int  row_start,
int  row_end,
int  column_start,
int  column_end 
)

Definition at line 233 of file tablerecog.cpp.

234  {
235  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
236  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
237  column_end < column_count());
238  int cell_count = 0;
239  TBOX cell_box;
240  for (int row = row_start; row <= row_end; ++row) {
241  cell_box.set_bottom(cell_y_[row]);
242  cell_box.set_top(cell_y_[row + 1]);
243  for (int col = column_start; col <= column_end; ++col) {
244  cell_box.set_left(cell_x_[col]);
245  cell_box.set_right(cell_x_[col + 1]);
246  if (CountPartitions(cell_box) > 0)
247  ++cell_count;
248  }
249  }
250  return cell_count;
251 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void set_right(int x)
Definition: rect.h:78
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
void set_left(int x)
Definition: rect.h:71
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_bottom(int y)
Definition: rect.h:64
Definition: rect.h:30
void set_top(int y)
Definition: rect.h:57
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:689
int tesseract::StructuredTable::CountFilledCellsInColumn ( int  column)

Definition at line 230 of file tablerecog.cpp.

230  {
231  return CountFilledCells(0, row_count() - 1, column, column);
232 }
int tesseract::StructuredTable::CountFilledCellsInRow ( int  row)

Definition at line 227 of file tablerecog.cpp.

227  {
228  return CountFilledCells(row, row, 0, column_count() - 1);
229 }
int tesseract::StructuredTable::CountHorizontalIntersections ( int  y)
protected

Definition at line 663 of file tablerecog.cpp.

663  {
664  int count = 0;
665  // Make a small box to keep the search time down.
666  const int kGridSize = text_grid_->gridsize();
667  TBOX horizontal_box = bounding_box_;
668  horizontal_box.set_bottom(y - kGridSize);
669  horizontal_box.set_top(y + kGridSize);
670 
671  ColPartitionGridSearch gsearch(text_grid_);
672  gsearch.SetUniqueMode(true);
673  gsearch.StartRectSearch(horizontal_box);
674  ColPartition* text = NULL;
675  while ((text = gsearch.NextRectSearch()) != NULL) {
676  if (!text->IsTextType())
677  continue;
678 
679  const TBOX& box = text->bounding_box();
680  if (box.bottom() < y && y < box.top())
681  ++count;
682  }
683  return count;
684 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
void set_bottom(int y)
Definition: rect.h:64
int gridsize() const
Definition: bbgrid.h:63
inT16 bottom() const
Definition: rect.h:61
int count(LIST var_list)
Definition: oldlist.cpp:108
Definition: rect.h:30
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int tesseract::StructuredTable::CountPartitions ( const TBOX & box)
protected

Definition at line 689 of file tablerecog.cpp.

689  {
690  ColPartitionGridSearch gsearch(text_grid_);
691  gsearch.SetUniqueMode(true);
692  gsearch.StartRectSearch(box);
693  int count = 0;
694  ColPartition* text = NULL;
695  while ((text = gsearch.NextRectSearch()) != NULL) {
696  if (text->IsTextType())
697  ++count;
698  }
699  return count;
700 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
int count(LIST var_list)
Definition: oldlist.cpp:108
#define NULL
Definition: host.h:144
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int tesseract::StructuredTable::CountVerticalIntersections ( int  x)
protected

Definition at line 639 of file tablerecog.cpp.

639  {
640  int count = 0;
641  // Make a small box to keep the search time down.
642  const int kGridSize = text_grid_->gridsize();
643  TBOX vertical_box = bounding_box_;
644  vertical_box.set_left(x - kGridSize);
645  vertical_box.set_right(x + kGridSize);
646 
647  ColPartitionGridSearch gsearch(text_grid_);
648  gsearch.SetUniqueMode(true);
649  gsearch.StartRectSearch(vertical_box);
650  ColPartition* text = NULL;
651  while ((text = gsearch.NextRectSearch()) != NULL) {
652  if (!text->IsTextType())
653  continue;
654  const TBOX& box = text->bounding_box();
655  if (box.left() < x && x < box.right())
656  ++count;
657  }
658  return count;
659 }
void set_right(int x)
Definition: rect.h:78
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
void set_left(int x)
Definition: rect.h:71
inT16 left() const
Definition: rect.h:68
int gridsize() const
Definition: bbgrid.h:63
int count(LIST var_list)
Definition: oldlist.cpp:108
Definition: rect.h:30
#define NULL
Definition: host.h:144
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void tesseract::StructuredTable::Display ( ScrollView * window,
ScrollView::Color  color 
)

Definition at line 290 of file tablerecog.cpp.

290  {
291 #ifndef GRAPHICS_DISABLED
292  window->Brush(ScrollView::NONE);
293  window->Pen(color);
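294  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
295  bounding_box_.right(), bounding_box_.top());
296  for (int i = 0; i < cell_x_.length(); i++) {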
297  window->Line(cell_x_[i], bounding_box_.bottom(),
298  cell_x_[i], bounding_box_.top());
299  }
300  for (int i = 0; i < cell_y_.length(); i++) {
301  window->Line(bounding_box_.left(), cell_y_[i],
302  bounding_box_.right(), cell_y_[i]);
303  }
304  window->UpdateWindow();
305 #endif
306 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void Pen(Color color)
Definition: scrollview.cpp:726
int length() const
Definition: genericvector.h:79
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
void Brush(Color color)
Definition: scrollview.cpp:732
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 bottom() const
Definition: rect.h:61
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
bool tesseract::StructuredTable::DoesPartitionFit ( const ColPartition & part) const

Definition at line 212 of file tablerecog.cpp.

212  {
213  const TBOX& box = part.bounding_box();
214  for (int i = 0; i < cell_x_.length(); ++i)
215  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
216  return false;
217  for (int i = 0; i < cell_y_.length(); ++i)
218  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
219  return false;
220  return true;
221 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
inT16 top() const
Definition: rect.h:54
void tesseract::StructuredTable::FindCellSplitLocations ( const GenericVector< int > &  min_list,
const GenericVector< int > &  max_list,
int  max_merged,
GenericVector< int > *  locations 
)
static protected

Definition at line 593 of file tablerecog.cpp.

596  {
597  locations->clear();
598  ASSERT_HOST(min_list.length() == max_list.length());
599  if (min_list.length() == 0)
600  return;
601  ASSERT_HOST(min_list.get(0) < max_list.get(0));
602  ASSERT_HOST(min_list.get(min_list.length() - 1) <
603  max_list.get(max_list.length() - 1));
604 
605  locations->push_back(min_list.get(0));
606  int min_index = 0;
607  int max_index = 0;
608  int stacked_partitions = 0;
609  int last_cross_position = MAX_INT32;
610  // max_index will expire after min_index.
611  // However, we can't "increase" the hill size if min_index expired.
612  // So finish processing when min_index expires.
613  while (min_index < min_list.length()) {
614  // Increase the hill count.
615  if (min_list[min_index] < max_list[max_index]) {
616  ++stacked_partitions;
617  if (last_cross_position != MAX_INT32 &&
618  stacked_partitions > max_merged) {
619  int mid = (last_cross_position + min_list[min_index]) / 2;
620  locations->push_back(mid);
621  last_cross_position = MAX_INT32;
622  }
623  ++min_index;
624  } else {
625  // Decrease the hill count.
626  --stacked_partitions;
627  if (last_cross_position == MAX_INT32 &&
628  stacked_partitions <= max_merged) {
629  last_cross_position = max_list[max_index];
630  }
631  ++max_index;
632  }
633  }
634  locations->push_back(max_list.get(max_list.length() - 1));
635 }
int length() const
Definition: genericvector.h:79
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define MAX_INT32
Definition: host.h:120
T & get(int index) const
int tesseract::StructuredTable::FindHorizontalMargin ( ColPartitionGrid * grid,
int  start_y,
bool  decrease 
) const
protected

Definition at line 502 of file tablerecog.cpp.

503  {
504  ColPartitionGridSearch gsearch(grid);
505  gsearch.SetUniqueMode(true);
506  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
507  ColPartition* part = NULL;
508  while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
509  if (!part->IsTextType() && !part->IsVerticalLine())
510  continue;
511  int distance = decrease ? border - part->bounding_box().right()
512  : part->bounding_box().left() - border;
513  if (distance >= 0)
514  return distance;
515  }
516  return MAX_INT32;
517 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define MAX_INT32
Definition: host.h:120
inT16 bottom() const
Definition: rect.h:61
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
bool tesseract::StructuredTable::FindLinedStructure ( )

Definition at line 141 of file tablerecog.cpp.

141  {
142  ClearStructure();
143 
144  // Search for all of the lines in the current box.
145  // Update the cellular structure with the exact lines.
146  ColPartitionGridSearch box_search(line_grid_);
147  box_search.SetUniqueMode(true);
148  box_search.StartRectSearch(bounding_box_);
149  ColPartition* line = NULL;
150 
151  while ((line = box_search.NextRectSearch()) != NULL) {
152  if (line->IsHorizontalLine())
153  cell_y_.push_back(line->MidY());
154  if (line->IsVerticalLine())
155  cell_x_.push_back(line->MidX());
156  }
157 
158  // HasSignificantLines should guarantee cells.
159  // Because that code is a different class, just gracefully
160  // return false. This could be an assert.
161  if (cell_x_.length() < 3 || cell_y_.length() < 3)
162  return false;
163 
164  cell_x_.sort();
165  cell_y_.sort();
166 
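167  // Remove duplicates that may have occurred due to split lines.
168  cell_x_.compact_sorted();
169  cell_y_.compact_sorted();
170 
171  // The border should be the extents of line boxes, not middle.
172  cell_x_[0] = bounding_box_.left();
173  cell_x_[cell_x_.length() - 1] = bounding_box_.right();
174  cell_y_[0] = bounding_box_.bottom();
175  cell_y_[cell_y_.length() - 1] = bounding_box_.top();
176 
177  // Remove duplicates that may have occurred due to moving the borders.
178  cell_x_.compact_sorted();
179  cell_y_.compact_sorted();
180 
181  CalculateMargins();
182  CalculateStats();
183  is_lined_ = VerifyLinedTableCells();
184  return is_lined_;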
185 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
int push_back(T object)
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
void compact_sorted()
inT16 left() const
Definition: rect.h:68
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
inT16 bottom() const
Definition: rect.h:61
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
int tesseract::StructuredTable::FindVerticalMargin ( ColPartitionGrid * grid,
int  start_x,
bool  decrease 
) const
protected

Definition at line 485 of file tablerecog.cpp.

486  {
487  ColPartitionGridSearch gsearch(grid);
488  gsearch.SetUniqueMode(true);
489  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
490  border);
491  ColPartition* part = NULL;
492  while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
493  if (!part->IsTextType() && !part->IsHorizontalLine())
494  continue;
495  int distance = decrease ? border - part->bounding_box().top()
496  : part->bounding_box().bottom() - border;
497  if (distance >= 0)
498  return distance;
499  }
500  return MAX_INT32;
501 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
#define MAX_INT32
Definition: host.h:120
#define NULL
Definition: host.h:144
void tesseract::StructuredTable::FindWhitespacedColumns ( )
protected

Definition at line 355 of file tablerecog.cpp.

355  {
356  // Set of the extents of all partitions on the page.
357  GenericVectorEqEq<int> left_sides;
358  GenericVectorEqEq<int> right_sides;
359 
360  // Look at each text partition. We want to find the partitions
361  // that have extremal left/right sides. These will give us a basis
362  // for the table columns.
363  ColPartitionGridSearch gsearch(text_grid_);
364  gsearch.SetUniqueMode(true);
365  gsearch.StartRectSearch(bounding_box_);
366  ColPartition* text = NULL;
367  while ((text = gsearch.NextRectSearch()) != NULL) {
368  if (!text->IsTextType())
369  continue;
370 
371  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
372  int spacing = static_cast<int>(text->median_width() *
373  kHorizontalSpacing / 2.0 + 0.5);
374  left_sides.push_back(text->bounding_box().left() - spacing);
375  right_sides.push_back(text->bounding_box().right() + spacing);
376  }
377  // It causes disaster below, so avoid it!
378  if (left_sides.length() == 0 || right_sides.length() == 0)
379  return;
380 
381  // Since data may be inserted in grid order, we sort the left/right sides.
382  left_sides.sort();
383  right_sides.sort();
384 
385  // At this point, in the "merged list", we expect to have a left side,
386  // followed by either more left sides or a right side. The last number
387  // should be a right side. We find places where the splits occur by looking
388  // for "valleys". If we want to force gap sizes or allow overlap, change
389  // the spacing above. If you want to let lines "slice" partitions as long
390  // as it is infrequent, change the following function.
391  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
392  &cell_x_);
393 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
int push_back(T object)
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:593
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define ASSERT_HOST(x)
Definition: errcode.h:84
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:40
const double kHorizontalSpacing
Definition: tablerecog.cpp:33
#define NULL
Definition: host.h:144
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void tesseract::StructuredTable::FindWhitespacedRows ( )
protected

Definition at line 400 of file tablerecog.cpp.

400  {
401  // Set of the extents of all partitions on the page.
402  GenericVectorEqEq<int> bottom_sides;
403  GenericVectorEqEq<int> top_sides;
404  // We will be "shrinking" partitions, so keep the min/max around to
405  // make sure the bottom/top lines do not intersect text.
406  int min_bottom = MAX_INT32;
407  int max_top = MIN_INT32;
408 
409  // Look at each text partition. We want to find the partitions
410  // that have extremal bottom/top sides. These will give us a basis
411  // for the table rows. Because the textlines can be skewed and close due
412  // to warping, the height of the partitions is toned down a little bit.
413  ColPartitionGridSearch gsearch(text_grid_);
414  gsearch.SetUniqueMode(true);
415  gsearch.StartRectSearch(bounding_box_);
416  ColPartition* text = NULL;
417  while ((text = gsearch.NextRectSearch()) != NULL) {
418  if (!text->IsTextType())
419  continue;
420 
421  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
422  min_bottom = MIN(min_bottom, text->bounding_box().bottom());
423  max_top = MAX(max_top, text->bounding_box().top());
424 
425  // Ignore "tall" text partitions, as these are usually false positive
426  // vertical text or multiple lines pulled together.
427  if (text->bounding_box().height() > max_text_height_)
428  continue;
429 
430  int spacing = static_cast<int>(text->bounding_box().height() *
431  kVerticalSpacing / 2.0 + 0.5);
432  int bottom = text->bounding_box().bottom() - spacing;
433  int top = text->bounding_box().top() + spacing;
434  // For horizontal text, the factor can be negative. This should
435  // probably cause a warning or failure. I haven't actually checked if
436  // it happens.
437  if (bottom >= top)
438  continue;
439 
440  bottom_sides.push_back(bottom);
441  top_sides.push_back(top);
442  }
443  // It causes disaster below, so avoid it!
444  if (bottom_sides.length() == 0 || top_sides.length() == 0)
445  return;
446 
447  // Since data may be inserted in grid order, we sort the bottom/top sides.
448  bottom_sides.sort();
449  top_sides.sort();
450 
451  // At this point, in the "merged list", we expect to have a bottom side,
452  // followed by either more bottom sides or a top side. The last number
453  // should be a top side. We find places where the splits occur by looking
454  // for "valleys". If we want to force gap sizes or allow overlap, change
455  // the spacing above. If you want to let lines "slice" partitions as long
456  // as it is infrequent, change the following function.
457  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
458  &cell_y_);
459 
460  // Recover the min/max correctly since it was shifted.
461  cell_y_[0] = min_bottom;
462  cell_y_[cell_y_.length() - 1] = max_top;
463 }
int length() const
Definition: genericvector.h:79
#define MAX(x, y)
Definition: ndminx.h:24
int push_back(T object)
#define MIN(x, y)
Definition: ndminx.h:28
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:593
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define MIN_INT32
Definition: host.h:128
#define ASSERT_HOST(x)
Definition: errcode.h:84
const double kVerticalSpacing
Definition: tablerecog.cpp:36
#define MAX_INT32
Definition: host.h:120
#define NULL
Definition: host.h:144
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:39
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
bool tesseract::StructuredTable::FindWhitespacedStructure ( )

Definition at line 188 of file tablerecog.cpp.

188  {
189  ClearStructure();
190  FindWhitespacedColumns();
191  FindWhitespacedRows();
192 
193  if (!VerifyWhitespacedTable()) {
194  return false;
195  } else {
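196  bounding_box_.set_left(cell_x_[0]);
197  bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
198  bounding_box_.set_bottom(cell_y_[0]);
199  bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
200  AbsorbNearbyLines();
201  CalculateMargins();
202  CalculateStats();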
203  return true;
204  }
205 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
void set_right(int x)
Definition: rect.h:78
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
void set_left(int x)
Definition: rect.h:71
void set_bottom(int y)
Definition: rect.h:64
void set_top(int y)
Definition: rect.h:57
void tesseract::StructuredTable::Init ( )

Definition at line 83 of file tablerecog.cpp.

83  {
84 }
bool tesseract::StructuredTable::is_lined ( ) const

Definition at line 95 of file tablerecog.cpp.

95  {
96  return is_lined_;
97 }
int tesseract::StructuredTable::median_cell_height ( )

Definition at line 113 of file tablerecog.cpp.

113  {
114  return median_cell_height_;
115 }
int tesseract::StructuredTable::median_cell_width ( )

Definition at line 116 of file tablerecog.cpp.

116  {
117  return median_cell_width_;
118 }
int tesseract::StructuredTable::row_count ( ) const

Definition at line 98 of file tablerecog.cpp.

98  {
99  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
100 }
int length() const
Definition: genericvector.h:79
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int tesseract::StructuredTable::row_height ( int  row) const

Definition at line 119 of file tablerecog.cpp.

119  {
120  ASSERT_HOST(0 <= row && row < row_count());
121  return cell_y_[row + 1] - cell_y_[row];
122 }
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::StructuredTable::set_bounding_box ( const TBOX & box)

Definition at line 107 of file tablerecog.cpp.

107  {
108  bounding_box_ = box;
109 }
void tesseract::StructuredTable::set_line_grid ( ColPartitionGrid * lines)

Definition at line 89 of file tablerecog.cpp.

89  {
90  line_grid_ = line_grid;
91 }
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
void tesseract::StructuredTable::set_max_text_height ( int  height)

Definition at line 92 of file tablerecog.cpp.

92  {
93  max_text_height_ = height;
94 }
void tesseract::StructuredTable::set_text_grid ( ColPartitionGrid * text)

Definition at line 86 of file tablerecog.cpp.

86  {
87  text_grid_ = text_grid;
88 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int tesseract::StructuredTable::space_above ( ) const

Definition at line 127 of file tablerecog.cpp.

127  {
128  return space_above_;
129 }
int tesseract::StructuredTable::space_below ( ) const

Definition at line 130 of file tablerecog.cpp.

130  {
131  return space_below_;
132 }
void tesseract::StructuredTable::UpdateMargins ( ColPartitionGrid * grid)
protected

Definition at line 475 of file tablerecog.cpp.

475  {
476  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
477  space_below_ = MIN(space_below_, below);
478  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
479  space_above_ = MIN(space_above_, above);
480  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
481  space_left_ = MIN(space_left_, left);
482  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
483  space_right_ = MIN(space_right_, right);
484 }
#define MIN(x, y)
Definition: ndminx.h:28
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:485
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:502
inT16 bottom() const
Definition: rect.h:61
inT16 top() const
Definition: rect.h:54
bool tesseract::StructuredTable::VerifyLinedTableCells ( )
protected

Definition at line 323 of file tablerecog.cpp.

323  {
324  // Function only called when lines exist.
325  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
326  for (int i = 0; i < cell_y_.length(); ++i) {
327  if (CountHorizontalIntersections(cell_y_[i]) > 0)
328  return false;
329  }
330  for (int i = 0; i < cell_x_.length(); ++i) {
332  return false;
333  }
334  return true;
335 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:79
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:639
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:663
bool tesseract::StructuredTable::VerifyRowFilled ( int  row)

Definition at line 256 of file tablerecog.cpp.

256  {
257  for (int i = 0; i < column_count(); ++i) {
258  double area_filled = CalculateCellFilledPercentage(row, i);
259  if (area_filled >= kMinFilledArea)
260  return true;
261  }
262  return false;
263 }
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:267
const double kMinFilledArea
Definition: tablerecog.cpp:61
bool tesseract::StructuredTable::VerifyWhitespacedTable ( )
protected

Definition at line 345 of file tablerecog.cpp.

345  {
346  // criteria for a table, must be at least 2x3 or 3x2
347  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
348 }

Member Data Documentation

TBOX tesseract::StructuredTable::bounding_box_
protected

Definition at line 242 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_x_
protected

Definition at line 243 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_y_
protected

Definition at line 244 of file tablerecog.h.

bool tesseract::StructuredTable::is_lined_
protected

Definition at line 245 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::line_grid_
protected

Definition at line 238 of file tablerecog.h.

int tesseract::StructuredTable::max_text_height_
protected

Definition at line 254 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_height_
protected

Definition at line 251 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_width_
protected

Definition at line 252 of file tablerecog.h.

int tesseract::StructuredTable::space_above_
protected

Definition at line 247 of file tablerecog.h.

int tesseract::StructuredTable::space_below_
protected

Definition at line 248 of file tablerecog.h.

int tesseract::StructuredTable::space_left_
protected

Definition at line 249 of file tablerecog.h.

int tesseract::StructuredTable::space_right_
protected

Definition at line 250 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::text_grid_
protected

Definition at line 237 of file tablerecog.h.


The documentation for this class was generated from the following files: