All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORD & bleft () const
 
const ICOORD & tright () const
 
ScrollView * MakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void WriteToPix (const FCOORD &reskew)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

tesseract::TableFinder::TableFinder ( )

Definition at line 169 of file tablefind.cpp.

tesseract::TableFinder::~TableFinder ( )

Definition at line 177 of file tablefind.cpp.

177  {
178  // ColPartitions and ColSegments created by this class for storage in grids
179  // need to be deleted explicitly.
180  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
181  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
182  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
183  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
184  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
185 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColSegmentGrid table_grid_
Definition: tablefind.h:428
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:467
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424

Member Function Documentation

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1499 of file tablefind.cpp.

1499  {
1500  // Iterate the table regions in the grid
1501  ColSegment_CLIST adjusted_tables;
1502  ColSegment_C_IT it(&adjusted_tables);
1503  ColSegmentGridSearch gsearch(&table_grid_);
1504  gsearch.StartFullSearch();
1505  ColSegment* table = NULL;
1506  while ((table = gsearch.NextFullSearch()) != NULL) {
1507  const TBOX& table_box = table->bounding_box();
1508  TBOX grown_box = table_box;
1509  GrowTableBox(table_box, &grown_box);
1510  // To prevent a table from expanding again, do not insert the
1511  // modified box back to the grid. Instead move it to a list and
1512  // remove it from the grid. The list is moved later back to the grid.
1513  if (!grown_box.null_box()) {
1514  ColSegment* col = new ColSegment();
1515  col->InsertBox(grown_box);
1516  it.add_after_then_move(col);
1517  }
1518  gsearch.RemoveBBox();
1519  delete table;
1520  }
1521  // clear table grid to move final tables in it
1522  // TODO(nbeato): table_grid_ should already be empty. The above loop
1523  // removed everything. Maybe just assert it is empty?
1524  table_grid_.Clear();
1525  it.move_to_first();
1526  // move back final tables to table_grid_
1527  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1528  ColSegment* seg = it.extract();
1529  table_grid_.InsertBBox(true, true, seg);
1530  }
1531 }
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1533
void Clear()
Definition: bbgrid.h:458
bool null_box() const
Definition: rect.h:46
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX & blob) const
protected

Definition at line 515 of file tablefind.cpp.

515  {
516  const TBOX& box = blob.bounding_box();
517  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
518  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
519  const int median_area = global_median_xheight_ * global_median_blob_width_;
520  const double kAreaRequired = median_area * kAllowBlobArea;
521  // Keep comparisons strictly greater to disallow 0!
522  return box.height() > kHeightRequired &&
523  box.width() > kWidthRequired &&
524  box.area() > kAreaRequired;
525 }
const double kAllowBlobWidth
Definition: tablefind.cpp:60
inT32 area() const
Definition: rect.h:118
const double kAllowBlobHeight
Definition: tablefind.cpp:59
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
Definition: rect.h:30
const double kAllowBlobArea
Definition: tablefind.cpp:61
const TBOX & bounding_box() const
Definition: blobbox.h:215
bool tesseract::TableFinder::AllowTextPartition ( const ColPartition & part) const
protected

Definition at line 502 of file tablefind.cpp.

502  {
503  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
504  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
505  const int median_area = global_median_xheight_ * global_median_blob_width_;
506  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
507  // Keep comparisons strictly greater to disallow 0!
508  return part.median_size() > kHeightRequired &&
509  part.median_width() > kWidthRequired &&
510  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
511 }
const double kAllowTextHeight
Definition: tablefind.cpp:52
const double kAllowTextWidth
Definition: tablefind.cpp:53
const double kAllowTextArea
Definition: tablefind.cpp:54
bool tesseract::TableFinder::BelongToOneTable ( const TBOX & box1,
const TBOX & box2 
)
protected

Definition at line 1457 of file tablefind.cpp.

1457  {
1458  // Check the obvious case. Most likely not true because overlapping boxes
1459  // should already be merged, but seems like a good thing to do in case things
1460  // change.
1461  if (box1.overlap(box2))
1462  return true;
1463  // Check for ColPartitions spanning both table regions
1464  TBOX bbox = box1.bounding_union(box2);
1465  // Start a rect search on bbox
1466  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1467  rectsearch(&clean_part_grid_);
1468  rectsearch.StartRectSearch(bbox);
1469  ColPartition* part = NULL;
1470  while ((part = rectsearch.NextRectSearch()) != NULL) {
1471  const TBOX& part_box = part->bounding_box();
1472  // return true if a colpartition spanning both table regions is found
1473  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1474  !part->IsImageType())
1475  return true;
1476  }
1477  return false;
1478 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
Definition: rect.h:30
#define NULL
Definition: host.h:144
bool overlap(const TBOX &box) const
Definition: rect.h:345
const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 400 of file tablefind.cpp.

400  {
401  return clean_part_grid_.bleft();
402 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const ICOORD & bleft() const
Definition: bbgrid.h:72
bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX & b1,
const TBOX & b2 
)
protected

Definition at line 581 of file tablefind.cpp.

581  {
582  int x_margin = 20;
583  int y_margin = 5;
584  return (abs(b1.left() - b2.left()) < x_margin) &&
585  (abs(b1.right() - b2.right()) < x_margin) &&
586  (abs(b1.top()-b2.bottom()) < y_margin ||
587  abs(b2.top()-b1.bottom()) < y_margin);
588 }
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1716 of file tablefind.cpp.

1716  {
1717  int page_width = tright().x() - bleft().x();
1718  ASSERT_HOST(page_width > 0);
1719  // create an integer array to hold projection on x-axis
1720  int* table_xprojection = new int[page_width];
1721  // Iterate through all tables in the table grid
1722  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1723  table_search(&table_grid_);
1724  table_search.StartFullSearch();
1725  ColSegment* table;
1726  while ((table = table_search.NextFullSearch()) != NULL) {
1727  TBOX table_box = table->bounding_box();
1728  // reset the projection array
1729  for (int i = 0; i < page_width; i++) {
1730  table_xprojection[i] = 0;
1731  }
1732  // Start a rect search on table_box
1733  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1734  rectsearch(&clean_part_grid_);
1735  rectsearch.SetUniqueMode(true);
1736  rectsearch.StartRectSearch(table_box);
1737  ColPartition* part;
1738  while ((part = rectsearch.NextRectSearch()) != NULL) {
1739  if (!part->IsTextType())
1740  continue; // Do not consider non-text partitions
1741  if (part->flow() == BTFT_LEADER)
1742  continue; // Assume leaders are in tables
1743  TBOX part_box = part->bounding_box();
1744  // Do not consider partitions partially covered by the table
1745  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1746  continue;
1747  BLOBNBOX_CLIST* part_boxes = part->boxes();
1748  BLOBNBOX_C_IT pit(part_boxes);
1749 
1750  // Make sure overlapping blobs don't artificially inflate the number
1751  // of rows in the table. This happens frequently with things such as
1752  // decimals and split characters. Do this by assuming the column
1753  // partition is sorted mostly left to right and just clip
1754  // bounding boxes by the previous box's extent.
1755  int next_position_to_write = 0;
1756 
1757  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1758  BLOBNBOX *pblob = pit.data();
1759  // ignore blob height for the purpose of projection since we
1760  // are only interested in finding valleys
1761  int xstart = pblob->bounding_box().left();
1762  int xend = pblob->bounding_box().right();
1763 
1764  xstart = MAX(xstart, next_position_to_write);
1765  for (int i = xstart; i < xend; i++)
1766  table_xprojection[i - bleft().x()]++;
1767  next_position_to_write = xend;
1768  }
1769  }
1770  // Find largest valley between two reasonable peaks in the table
1771  if (!GapInXProjection(table_xprojection, page_width)) {
1772  table_search.RemoveBBox();
1773  delete table;
1774  }
1775  }
1776  delete[] table_xprojection;
1777 }
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MAX(x, y)
Definition: ndminx.h:24
const ICOORD & bleft() const
Definition: tablefind.cpp:400
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
const ICOORD & tright() const
Definition: tablefind.cpp:403
inT16 left() const
Definition: rect.h:68
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
inT16 x() const
access function
Definition: points.h:52
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
const TBOX & bounding_box() const
Definition: blobbox.h:215
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1781
void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1963 of file tablefind.cpp.

1966  {
1967 #ifndef GRAPHICS_DISABLED
1968  // Iterate the ColPartitions in the grid.
1969  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1970  gsearch(grid);
1971  gsearch.StartFullSearch();
1972  ColPartition* part = NULL;
1973  while ((part = gsearch.NextFullSearch()) != NULL) {
1974  const TBOX& box = part->bounding_box();
1975  int left_x = box.left();
1976  int right_x = box.right();
1977  int top_y = box.top();
1978  int bottom_y = box.bottom();
1979 
1980  ColPartition* upper_part = part->nearest_neighbor_above();
1981  if (upper_part) {
1982  TBOX upper_box = upper_part->bounding_box();
1983  int mid_x = (left_x + right_x) / 2;
1984  int mid_y = (top_y + bottom_y) / 2;
1985  int other_x = (upper_box.left() + upper_box.right()) / 2;
1986  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1987  win->Brush(ScrollView::NONE);
1988  win->Pen(color);
1989  win->Line(mid_x, mid_y, other_x, other_y);
1990  }
1991  ColPartition* lower_part = part->nearest_neighbor_below();
1992  if (lower_part) {
1993  TBOX lower_box = lower_part->bounding_box();
1994  int mid_x = (left_x + right_x) / 2;
1995  int mid_y = (top_y + bottom_y) / 2;
1996  int other_x = (lower_box.left() + lower_box.right()) / 2;
1997  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1998  win->Brush(ScrollView::NONE);
1999  win->Pen(color);
2000  win->Line(mid_x, mid_y, other_x, other_y);
2001  }
2002  }
2003  win->UpdateWindow();
2004 #endif
2005 }
void Pen(Color color)
Definition: scrollview.cpp:726
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
void Brush(Color color)
Definition: scrollview.cpp:732
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1929 of file tablefind.cpp.

1932  {
1933 #ifndef GRAPHICS_DISABLED
1934  ScrollView::Color color = default_color;
1935  // Iterate the ColPartitions in the grid.
1936  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1937  gsearch(grid);
1938  gsearch.StartFullSearch();
1939  ColPartition* part = NULL;
1940  while ((part = gsearch.NextFullSearch()) != NULL) {
1941  color = default_color;
1942  if (part->type() == PT_TABLE)
1943  color = table_color;
1944 
1945  const TBOX& box = part->bounding_box();
1946  int left_x = box.left();
1947  int right_x = box.right();
1948  int top_y = box.top();
1949  int bottom_y = box.bottom();
1950  win->Brush(ScrollView::NONE);
1951  win->Pen(color);
1952  win->Rectangle(left_x, bottom_y, right_x, top_y);
1953  }
1954  win->UpdateWindow();
1955 #endif
1956 }
void Pen(Color color)
Definition: scrollview.cpp:726
inT16 right() const
Definition: rect.h:75
Definition: capi.h:78
inT16 left() const
Definition: rect.h:68
void Brush(Color color)
Definition: scrollview.cpp:732
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1957 of file tablefind.cpp.

1959  {
1960  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1961 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1929
void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView * win,
ColSegmentGrid * grid,
ScrollView::Color  color 
)
protected

Definition at line 1904 of file tablefind.cpp.

1905  {
1906 #ifndef GRAPHICS_DISABLED
1907  // Iterate the ColPartitions in the grid.
1908  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1909  gsearch(grid);
1910  gsearch.StartFullSearch();
1911  ColSegment* seg = NULL;
1912  while ((seg = gsearch.NextFullSearch()) != NULL) {
1913  const TBOX& box = seg->bounding_box();
1914  int left_x = box.left();
1915  int right_x = box.right();
1916  int top_y = box.top();
1917  int bottom_y = box.bottom();
1918  win->Brush(ScrollView::NONE);
1919  win->Pen(color);
1920  win->Rectangle(left_x, bottom_y, right_x, top_y);
1921  }
1922  win->UpdateWindow();
1923 #endif
1924 }
void Pen(Color color)
Definition: scrollview.cpp:726
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
void Brush(Color color)
Definition: scrollview.cpp:732
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::DisplayColSegments ( ScrollView * win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1884 of file tablefind.cpp.

1886  {
1887 #ifndef GRAPHICS_DISABLED
1888  win->Pen(color);
1889  win->Brush(ScrollView::NONE);
1890  ColSegment_IT it(segments);
1891  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1892  ColSegment* col = it.data();
1893  const TBOX& box = col->bounding_box();
1894  int left_x = box.left();
1895  int right_x = box.right();
1896  int top_y = box.top();
1897  int bottom_y = box.bottom();
1898  win->Rectangle(left_x, bottom_y, right_x, top_y);
1899  }
1900  win->UpdateWindow();
1901 #endif
1902 }
void Pen(Color color)
Definition: scrollview.cpp:726
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
void Brush(Color color)
Definition: scrollview.cpp:732
void UpdateWindow()
Definition: scrollview.cpp:710
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 1001 of file tablefind.cpp.

1001  {
1004  // TODO(nbeato): Fully justified text as non-table?
1005 }
void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1087 of file tablefind.cpp.

1087  {
1088  // Consider top-most text colpartition as header and bottom most as footer
1089  ColPartition* header = NULL;
1090  ColPartition* footer = NULL;
1091  int max_top = MIN_INT32;
1092  int min_bottom = MAX_INT32;
1093  ColPartitionGridSearch gsearch(&clean_part_grid_);
1094  gsearch.StartFullSearch();
1095  ColPartition* part = NULL;
1096  while ((part = gsearch.NextFullSearch()) != NULL) {
1097  if (!part->IsTextType())
1098  continue; // Consider only text partitions
1099  int top = part->bounding_box().top();
1100  int bottom = part->bounding_box().bottom();
1101  if (top > max_top) {
1102  max_top = top;
1103  header = part;
1104  }
1105  if (bottom < min_bottom) {
1106  min_bottom = bottom;
1107  footer = part;
1108  }
1109  }
1110  if (header)
1111  header->clear_table_type();
1112  if (footer)
1113  footer->clear_table_type();
1114 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define MIN_INT32
Definition: host.h:128
#define MAX_INT32
Definition: host.h:120
#define NULL
Definition: host.h:144
void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 1007 of file tablefind.cpp.

1007  {
1008  // Detect last line of paragraph
1009  // Iterate the ColPartitions in the grid.
1010  ColPartitionGridSearch gsearch(&clean_part_grid_);
1011  gsearch.StartFullSearch();
1012  ColPartition* part = NULL;
1013  while ((part = gsearch.NextFullSearch()) != NULL) {
1014  if (part->type() != PT_TABLE)
1015  continue; // Consider only table partitions
1016 
1017  // Paragraph ending should have flowing text above it.
1018  ColPartition* upper_part = part->nearest_neighbor_above();
1019  if (!upper_part)
1020  continue;
1021  if (upper_part->type() != PT_FLOWING_TEXT)
1022  continue;
1023  if (upper_part->bounding_box().width() <
1024  2 * part->bounding_box().width())
1025  continue;
1026  // Check if it's the last line of a paragraph.
1027  // In most cases, a paragraph ending should be left-aligned to text line
1028  // above it. Sometimes, it could be a 2 line paragraph, in which case
1029  // the line above it is indented.
1030  // To account for that, check if the partition center is to
1031  // the left of the one above it.
1032  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1033  int upper_mid = (upper_part->bounding_box().left() +
1034  upper_part->bounding_box().right()) / 2;
1035  int current_spacing = 0; // spacing of the current line to margin
1036  int upper_spacing = 0; // spacing of the previous line to the margin
1037  if (left_to_right_language_) {
1038  // Left to right languages, use mid - left to figure out the distance
1039  // the middle is from the left margin.
1040  int left = MIN(part->bounding_box().left(),
1041  upper_part->bounding_box().left());
1042  current_spacing = mid - left;
1043  upper_spacing = upper_mid - left;
1044  } else {
1045  // Right to left languages, use right - mid to figure out the distance
1046  // the middle is from the right margin.
1047  int right = MAX(part->bounding_box().right(),
1048  upper_part->bounding_box().right());
1049  current_spacing = right - mid;
1050  upper_spacing = right - upper_mid;
1051  }
1052  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1053  continue;
1054 
1055  // Paragraphs should have similar fonts.
1056  if (!part->MatchingSizes(*upper_part) ||
1057  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1058  kStrokeWidthConstantTolerance)) {
1059  continue;
1060  }
1061 
1062  // The last line of a paragraph should be left aligned.
1063  // TODO(nbeato): This would be untrue if the text was right aligned.
1064  // How often is that?
1065  if (part->space_to_left() >
1066  kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
1067  continue;
1068  // The line above it should be right aligned (assuming justified format).
1069  // Since we can't assume justified text, we compare whitespace to text.
1070  // The above line should have majority spanning text (or the current
1071  // line could have fit on the previous line). So compare
1072  // whitespace to text.
1073  if (upper_part->bounding_box().width() <
1074  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1075  continue;
1076 
1077  // Ledding above the line should be less than ledding below
1078  if (part->space_above() >= part->space_below() ||
1079  part->space_above() > 2 * global_median_ledding_)
1080  continue;
1081 
1082  // If all checks failed, it is probably text.
1083  part->clear_table_type();
1084  }
1085 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:148
Definition: capi.h:78
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:130
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:140
#define NULL
Definition: host.h:144
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:134
const double kStrokeWidthConstantTolerance
void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 779 of file tablefind.cpp.

779  {
780  ColPartitionGridSearch gsearch(&clean_part_grid_);
781  gsearch.StartFullSearch();
782  ColPartition* part = NULL;
783  while ((part = gsearch.NextFullSearch()) != NULL) {
784  // TODO(nbeato): Rename this function, meaning is different now.
785  // IT is finding nearest neighbors its own way
786  //SetVerticalSpacing(part);
787 
788  ColPartition* upper = part->SingletonPartner(true);
789  if (upper)
790  part->set_nearest_neighbor_above(upper);
791 
792  ColPartition* lower = part->SingletonPartner(false);
793  if (lower)
794  part->set_nearest_neighbor_below(lower);
795  }
796 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
#define NULL
Definition: host.h:144
bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1781 of file tablefind.cpp.

1781  {
1782  // Find peak value of the histogram
1783  int peak_value = 0;
1784  for (int i = 0; i < length; i++) {
1785  if (xprojection[i] > peak_value) {
1786  peak_value = xprojection[i];
1787  }
1788  }
1789  // Peak value represents the maximum number of horizontally
1790  // overlapping colpartitions, so this can be considered as the
1791  // number of rows in the table
1792  if (peak_value < kMinRowsInTable)
1793  return false;
1794  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1795  if (peak_value >= kLargeTableRowCount)
1796  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1797  // Threshold the histogram
1798  for (int i = 0; i < length; i++) {
1799  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1800  }
1801  // Find the largest run of zeros between two ones
1802  int largest_gap = 0;
1803  int run_start = -1;
1804  for (int i = 1; i < length; i++) {
1805  // detect start of a run of zeros
1806  if (xprojection[i - 1] && !xprojection[i]) {
1807  run_start = i;
1808  }
1809  // detect end of a run of zeros and update the value of largest gap
1810  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1811  int gap = i - run_start;
1812  if (gap > largest_gap)
1813  largest_gap = gap;
1814  run_start = -1;
1815  }
1816  }
1817  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1818 }
const int kLargeTableRowCount
Definition: tablefind.cpp:112
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:109
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:144
const int kMinRowsInTable
Definition: tablefind.cpp:115
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:110
void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 536 of file tablefind.cpp.

537  {
538  for (int i = 0; i < gridheight(); ++i) {
539  ColPartitionSet* columns = all_columns[i];
540  if (columns != NULL) {
541  ColSegment_LIST new_blocks;
542  // Get boxes from the current vertical position on the grid
543  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
544  // Merge the new_blocks boxes into column_blocks if they are well-aligned
545  GroupColumnBlocks(&new_blocks, column_blocks);
546  }
547  }
548 }
int gridheight() const
Definition: tablefind.cpp:397
int gridsize() const
Definition: tablefind.cpp:391
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:551
#define NULL
Definition: host.h:144
void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1286 of file tablefind.cpp.

1286  {
1287  ColSegment_IT it(table_columns);
1288  // Iterate the ColPartitions in the grid.
1289  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1290  gsearch(&clean_part_grid_);
1291  gsearch.StartFullSearch();
1292  ColPartition* part;
1293  while ((part = gsearch.NextFullSearch()) != NULL) {
1294  if (part->inside_table_column() || part->type() != PT_TABLE)
1295  continue; // prevent a partition to be assigned to multiple columns
1296  const TBOX& box = part->bounding_box();
1297  ColSegment* col = new ColSegment();
1298  col->InsertBox(box);
1299  part->set_inside_table_column(true);
1300  // Start a search below the current cell to find bottom neighbours
1301  // Note: a full search will always process things above it first, so
1302  // this should be starting at the highest cell and working its way down.
1303  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1304  vsearch(&clean_part_grid_);
1305  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1306  ColPartition* neighbor = NULL;
1307  bool found_neighbours = false;
1308  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1309  // only consider neighbors not assigned to any column yet
1310  if (neighbor->inside_table_column())
1311  continue;
1312  // Horizontal lines should not break the flow
1313  if (neighbor->IsHorizontalLine())
1314  continue;
1315  // presence of a non-table neighbor marks the end of current
1316  // table column
1317  if (neighbor->type() != PT_TABLE)
1318  break;
1319  // add the neighbor partition to the table column
1320  const TBOX& neighbor_box = neighbor->bounding_box();
1321  col->InsertBox(neighbor_box);
1322  neighbor->set_inside_table_column(true);
1323  found_neighbours = true;
1324  }
1325  if (found_neighbours) {
1326  it.add_after_then_move(col);
1327  } else {
1328  part->set_inside_table_column(false);
1329  delete col;
1330  }
1331  }
1332 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
inT16 right() const
Definition: rect.h:75
Definition: capi.h:78
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
#define NULL
Definition: host.h:144
void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1336 of file tablefind.cpp.

1337  {
1338  ColSegment_IT cit(table_columns);
1339  ColSegment_IT rit(table_regions);
1340  // Iterate through column blocks
1341  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1342  gsearch(&col_seg_grid_);
1343  gsearch.StartFullSearch();
1344  ColSegment* part;
1345  int page_height = tright().y() - bleft().y();
1346  ASSERT_HOST(page_height > 0);
1347  // create a bool array to hold projection on y-axis
1348  bool* table_region = new bool[page_height];
1349  while ((part = gsearch.NextFullSearch()) != NULL) {
1350  TBOX part_box = part->bounding_box();
1351  // reset the projection array
1352  for (int i = 0; i < page_height; i++) {
1353  table_region[i] = false;
1354  }
1355  // iterate through all table columns to find regions in the current
1356  // page column block
1357  cit.move_to_first();
1358  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1359  TBOX col_box = cit.data()->bounding_box();
1360  // find intersection region of table column and page column
1361  TBOX intersection_box = col_box.intersection(part_box);
1362  // project table column on the y-axis
1363  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1364  table_region[i - bleft().y()] = true;
1365  }
1366  }
1367  // set x-limits of table regions to page column width
1368  TBOX current_table_box;
1369  current_table_box.set_left(part_box.left());
1370  current_table_box.set_right(part_box.right());
1371  // go through the y-axis projection to find runs of table
1372  // regions. Each run makes one table region.
1373  for (int i = 1; i < page_height; i++) {
1374  // detect start of a table region
1375  if (!table_region[i - 1] && table_region[i]) {
1376  current_table_box.set_bottom(i + bleft().y());
1377  }
1378  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1379  // detect end of a table region
1380  if (table_region[i - 1] && !table_region[i]) {
1381  current_table_box.set_top(i + bleft().y());
1382  if (!current_table_box.null_box()) {
1383  ColSegment* seg = new ColSegment();
1384  seg->InsertBox(current_table_box);
1385  rit.add_after_then_move(seg);
1386  }
1387  }
1388  }
1389  }
1390  delete[] table_region;
1391 }
void set_right(int x)
Definition: rect.h:78
const ICOORD & bleft() const
Definition: tablefind.cpp:400
inT16 right() const
Definition: rect.h:75
bool null_box() const
Definition: rect.h:46
void set_left(int x)
Definition: rect.h:71
#define ASSERT_HOST(x)
Definition: errcode.h:84
const ICOORD & tright() const
Definition: tablefind.cpp:403
void set_bottom(int y)
Definition: rect.h:64
inT16 y() const
access_function
Definition: points.h:56
inT16 left() const
Definition: rect.h:68
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
inT16 bottom() const
Definition: rect.h:61
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
Definition: rect.h:30
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 397 of file tablefind.cpp.

397  {
398  return clean_part_grid_.gridheight();
399 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridheight() const
Definition: bbgrid.h:69
void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1208 of file tablefind.cpp.

1208  {
1209  int margin = gridsize();
1210 
1211  // Iterate the Column Blocks in the grid.
1212  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1213  gsearch(&col_seg_grid_);
1214  gsearch.StartFullSearch();
1215  ColSegment* seg;
1216  while ((seg = gsearch.NextFullSearch()) != NULL) {
1217  if (seg->type() != COL_TEXT)
1218  continue; // only consider text blocks for split detection
1219  bool neighbor_found = false;
1220  bool modified = false; // Modified at least once
1221  // keep expanding current box as long as neighboring table columns
1222  // are found above or below it.
1223  do {
1224  TBOX box = seg->bounding_box();
1225  // slightly expand the search region vertically
1226  int top_range = MIN(box.top() + margin, tright().y());
1227  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1228  box.set_top(top_range);
1229  box.set_bottom(bottom_range);
1230  neighbor_found = false;
1231  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1232  rectsearch(&col_seg_grid_);
1233  rectsearch.StartRectSearch(box);
1234  ColSegment* neighbor = NULL;
1235  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1236  if (neighbor == seg)
1237  continue;
1238  const TBOX& neighbor_box = neighbor->bounding_box();
1239  // If the neighbor box significantly overlaps with the current
1240  // box (due to the expansion of the current box in the
1241  // previous iteration of this loop), remove the neighbor box
1242  // and expand the current box to include it.
1243  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1244  seg->InsertBox(neighbor_box);
1245  modified = true;
1246  rectsearch.RemoveBBox();
1247  gsearch.RepositionIterator();
1248  delete neighbor;
1249  continue;
1250  }
1251  // Only expand if the neighbor box is of table type
1252  if (neighbor->type() != COL_TABLE)
1253  continue;
1254  // Insert the neighbor box into the current column block
1255  if (neighbor_box.major_x_overlap(box) &&
1256  !box.contains(neighbor_box)) {
1257  seg->InsertBox(neighbor_box);
1258  neighbor_found = true;
1259  modified = true;
1260  rectsearch.RemoveBBox();
1261  gsearch.RepositionIterator();
1262  delete neighbor;
1263  }
1264  }
1265  } while (neighbor_found);
1266  if (modified) {
1267  // Because the box has changed, it has to be removed first.
1268  gsearch.RemoveBBox();
1269  col_seg_grid_.InsertBBox(true, true, seg);
1270  gsearch.RepositionIterator();
1271  }
1272  }
1273 }
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
int gridsize() const
Definition: tablefind.cpp:391
const ICOORD & bleft() const
Definition: tablefind.cpp:400
const ICOORD & tright() const
Definition: tablefind.cpp:403
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
void set_bottom(int y)
Definition: rect.h:64
inT16 y() const
access_function
Definition: points.h:56
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
inT16 bottom() const
Definition: rect.h:61
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1399 of file tablefind.cpp.

1399  {
1400  // Iterate the table regions in the grid.
1401  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1402  gsearch(&table_grid_);
1403  gsearch.StartFullSearch();
1404  ColSegment* seg = NULL;
1405  while ((seg = gsearch.NextFullSearch()) != NULL) {
1406  bool neighbor_found = false;
1407  bool modified = false; // Modified at least once
1408  do {
1409  // Start a rectangle search x-bounded by the image and y by the table
1410  const TBOX& box = seg->bounding_box();
1411  TBOX search_region(box);
1412  search_region.set_left(bleft().x());
1413  search_region.set_right(tright().x());
1414  neighbor_found = false;
1415  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1416  rectsearch(&table_grid_);
1417  rectsearch.StartRectSearch(search_region);
1418  ColSegment* neighbor = NULL;
1419  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1420  if (neighbor == seg)
1421  continue;
1422  const TBOX& neighbor_box = neighbor->bounding_box();
1423  // Check if a neighbor box has a large overlap with the table
1424  // region. This may happen as a result of merging two table
1425  // regions in the previous iteration.
1426  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1427  seg->InsertBox(neighbor_box);
1428  rectsearch.RemoveBBox();
1429  gsearch.RepositionIterator();
1430  delete neighbor;
1431  modified = true;
1432  continue;
1433  }
1434  // Check if two table regions belong together based on a common
1435  // horizontal ruling line
1436  if (BelongToOneTable(box, neighbor_box)) {
1437  seg->InsertBox(neighbor_box);
1438  neighbor_found = true;
1439  modified = true;
1440  rectsearch.RemoveBBox();
1441  gsearch.RepositionIterator();
1442  delete neighbor;
1443  }
1444  }
1445  } while (neighbor_found);
1446  if (modified) {
1447  // Because the box has changed, it has to be removed first.
1448  gsearch.RemoveBBox();
1449  table_grid_.InsertBBox(true, true, seg);
1450  gsearch.RepositionIterator();
1451  }
1452  }
1453 }
const ICOORD & bleft() const
Definition: tablefind.cpp:400
const ICOORD & tright() const
Definition: tablefind.cpp:403
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1457
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 391 of file tablefind.cpp.

391  {
392  return clean_part_grid_.gridsize();
393 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridsize() const
Definition: bbgrid.h:63
int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 394 of file tablefind.cpp.

394  {
395  return clean_part_grid_.gridwidth();
396 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridwidth() const
Definition: bbgrid.h:66
void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 551 of file tablefind.cpp.

552  {
553  ColSegment_IT src_it(new_blocks);
554  ColSegment_IT dest_it(column_blocks);
555  // iterate through the source list
556  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
557  ColSegment* src_seg = src_it.data();
558  TBOX src_box = src_seg->bounding_box();
559  bool match_found = false;
560  // iterate through the destination list to find a matching column block
561  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
562  ColSegment* dest_seg = dest_it.data();
563  TBOX dest_box = dest_seg->bounding_box();
564  if (ConsecutiveBoxes(src_box, dest_box)) {
565  // If matching block is found, insert the current block into it
566  // and delete the source block
567  dest_seg->InsertBox(src_box);
568  match_found = true;
569  delete src_it.extract();
570  break;
571  }
572  }
573  // If no match is found, just append the source block to column_blocks
574  if (!match_found) {
575  dest_it.add_after_then_move(src_it.extract());
576  }
577  }
578 }
Definition: rect.h:30
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:581
void tesseract::TableFinder::GrowTableBox ( const TBOX &  table_box,
TBOX *  result_box 
)
protected

Definition at line 1533 of file tablefind.cpp.

1533  {
1534  // TODO(nbeato): The growing code is a bit excessive right now.
1535  // By removing these lines, the partitions considered need
1536  // to have some overlap or be special cases. These lines could
1537  // be added again once a check is put in place to make sure that
1538  // growing tables don't stomp on a lot of non-table partitions.
1539 
1540  // search for horizontal ruling lines within the vertical margin
1541  // int vertical_margin = kRulingVerticalMargin * gridsize();
1542  TBOX search_box = table_box;
1543  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1544  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1545  // search_box.set_top(top);
1546  // search_box.set_bottom(bottom);
1547 
1548  GrowTableToIncludePartials(table_box, search_box, result_box);
1549  GrowTableToIncludeLines(table_box, search_box, result_box);
1550  IncludeLeftOutColumnHeaders(result_box);
1551 }
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1583
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1555
Definition: rect.h:30
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1677
void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX &  table_box,
const TBOX &  search_range,
TBOX *  result_box 
)
protected

Definition at line 1583 of file tablefind.cpp.

1585  {
1586  ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1587  rsearch.SetUniqueMode(true);
1588  rsearch.StartRectSearch(search_range);
1589  ColPartition* part = NULL;
1590  while ((part = rsearch.NextRectSearch()) != NULL) {
1591  // TODO(nbeato) This should also do vertical, but column
1592  // boundaries are breaking things. This function needs to be
1593  // updated to allow vertical lines as well.
1594  if (!part->IsLineType())
1595  continue;
1596  // Avoid the following function call if the result of the
1597  // function is irrelevant.
1598  const TBOX& part_box = part->bounding_box();
1599  if (result_box->contains(part_box))
1600  continue;
1601  // Include a partially overlapping horizontal line only if the
1602  // extra ColPartitions that will be included due to expansion
1603  // have large side spacing w.r.t. columns containing them.
1604  if (HLineBelongsToTable(*part, table_box))
1605  *result_box = result_box->bounding_union(part_box);
1606  // TODO(nbeato): Vertical
1607  }
1608 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1613
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
Definition: rect.h:30
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX &  table_box,
const TBOX &  search_range,
TBOX *  result_box 
)
protected

Definition at line 1555 of file tablefind.cpp.

1557  {
1558  // Rulings are in a different grid, so search 2 grids for rulings, text,
1559  // and table partitions that are not entirely within the new box.
1560  for (int i = 0; i < 2; ++i) {
1561  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1562  &leader_and_ruling_grid_;
1563  ColPartitionGridSearch rectsearch(grid);
1564  rectsearch.StartRectSearch(search_range);
1565  ColPartition* part = NULL;
1566  while ((part = rectsearch.NextRectSearch()) != NULL) {
1567  // Only include text and table types.
1568  if (part->IsImageType())
1569  continue;
1570  const TBOX& part_box = part->bounding_box();
1571  // Include partition in the table if more than half of it
1572  // is covered by the table
1573  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1574  *result_box = result_box->bounding_union(part_box);
1575  continue;
1576  }
1577  }
1578  }
1579 }
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
#define NULL
Definition: host.h:144
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition &  part)
protected

Definition at line 959 of file tablefind.cpp.

959  {
960  if (part.flow() == BTFT_LEADER)
961  return true;
962  // Search range is left and right bounded by an offset of the
963  // median xheight. This offset is to allow some tolerance to the
964  // leaders on the page in the event that the alignment is still
965  // a bit off.
966  const TBOX& box = part.bounding_box();
967  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
968  const int top = box.top() + search_size;
969  const int bottom = box.bottom() - search_size;
970  ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
971  for (int direction = 0; direction < 2; ++direction) {
972  bool right_to_left = (direction == 0);
973  int x = right_to_left ? box.right() : box.left();
974  hsearch.StartSideSearch(x, bottom, top);
975  ColPartition* leader = NULL;
976  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
977  // The leader could be a horizontal ruling in the grid.
978  // Make sure it is actually a leader.
979  if (leader->flow() != BTFT_LEADER)
980  continue;
981  // This should not happen, they are in different grids.
982  ASSERT_HOST(&part != leader);
983  // Make sure the leader shares a page column with the partition,
984  // otherwise we are spreading across columns.
985  if (!part.IsInSameColumnAs(*leader))
986  break;
987  // There should be a significant vertical overlap
988  if (!leader->VSignificantCoreOverlap(part))
989  continue;
990  // Leader passed all tests, so it is adjacent.
991  return true;
992  }
993  }
994  // No leaders are adjacent to the given partition.
995  return false;
996 }
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:125
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition *  part) const
protected

Definition at line 870 of file tablefind.cpp.

870  {
871  // Should only get text partitions.
872  ASSERT_HOST(part->IsTextType());
873  // Blob access
874  BLOBNBOX_CLIST* part_boxes = part->boxes();
875  BLOBNBOX_C_IT it(part_boxes);
876  // Check if this is a relatively small partition (such as a single word)
877  if (part->bounding_box().width() <
878  kMinBoxesInTextPartition * part->median_size() &&
879  part_boxes->length() < kMinBoxesInTextPartition)
880  return true;
881 
882  // Variables used to compute inter-blob spacing.
883  int current_x0 = -1;
884  int current_x1 = -1;
885  int previous_x1 = -1;
886  // Stores the maximum gap detected.
887  int largest_partition_gap_found = -1;
888  // Text partition gap limits. If this is text (and not a table),
889  // there should be at least one gap larger than min_gap and no gap
890  // larger than max_gap.
891  const double max_gap = kMaxGapInTextPartition * part->median_size();
892  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
893 
894  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
895  BLOBNBOX* blob = it.data();
896  current_x0 = blob->bounding_box().left();
897  current_x1 = blob->bounding_box().right();
898  if (previous_x1 != -1) {
899  int gap = current_x0 - previous_x1;
900 
901  // TODO(nbeato): Boxes may overlap? Huh?
902  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
903  // on the top right of the page are filtered out with this line.
904  // Note 2: Iterating over blobs in a partition, so we are looking for
905  // spacing between the words.
906  if (gap < 0) {
907  // More likely case, the blobs slightly overlap. This can happen
908  // with diacritics (accents) or broken alphabet symbols (characters).
909  // Merge boxes together by taking max of right sides.
910  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
911  previous_x1 = MAX(previous_x1, current_x1);
912  continue;
913  }
914  // Extreme case, blobs overlap significantly in the same partition...
915  // This should not happen often (if at all), but it does.
916  // TODO(nbeato): investigate cases when this happens.
917  else {
918  // The behavior before was to completely ignore this case.
919  }
920  }
921 
922  // If a large enough gap is found, mark it as a table cell (return true)
923  if (gap > max_gap)
924  return true;
925  if (gap > largest_partition_gap_found)
926  largest_partition_gap_found = gap;
927  }
928  previous_x1 = current_x1;
929  }
930  // Since no large gap was found, return false if the partition is too
931  // long to be a data cell
932  if (part->bounding_box().width() >
933  kMaxBoxesInDataPartition * part->median_size() ||
934  part_boxes->length() > kMaxBoxesInDataPartition)
935  return false;
936 
937  // A partition may be a single blob. In this case, it's an isolated symbol
938  // or non-text (such as a ruling or image).
939  // Detect these as table partitions? Shouldn't this be case by case?
940  // The behavior before was to ignore this, making max_partition_gap < 0
941  // and implicitly return true. Just making it explicit.
942  if (largest_partition_gap_found == -1)
943  return true;
944 
945  // return true if the maximum gap found is smaller than the minimum allowed
946  // max_gap in a text partition. This indicates that there is no signficant
947  // space in the partition, hence it is likely a single word.
948  return largest_partition_gap_found < min_gap;
949 }
#define MAX(x, y)
Definition: ndminx.h:24
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:80
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:76
inT16 right() const
Definition: rect.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
const double kMaxGapInTextPartition
Definition: tablefind.cpp:72
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:69
const TBOX & bounding_box() const
Definition: blobbox.h:215
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:66
bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition &  part,
const TBOX &  table_box 
)
protected

Definition at line 1613 of file tablefind.cpp.

1614  {
1615  if (!part.IsHorizontalLine())
1616  return false;
1617  const TBOX& part_box = part.bounding_box();
1618  if (!part_box.major_x_overlap(table_box))
1619  return false;
1620  // Do not consider top-most horizontal line since it usually
1621  // originates from noise.
1622  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1623  // have neighbors solved.
1624  // if (!part.nearest_neighbor_above())
1625  // return false;
1626  const TBOX bbox = part_box.bounding_union(table_box);
1627  // In the "unioned table" box (the table extents expanded by the line),
1628  // keep track of how many partitions have significant padding to the left
1629  // and right. If more than half of the partitions covered by the new table
1630  // have significant spacing, the line belongs to the table and the table
1631  // grows to include all of the partitions.
1632  int num_extra_partitions = 0;
1633  int extra_space_to_right = 0;
1634  int extra_space_to_left = 0;
1635  // Rulings are in a different grid, so search 2 grids for rulings, text,
1636  // and table partitions that are introduced by the new box.
1637  for (int i = 0; i < 2; ++i) {
1638  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1639  &leader_and_ruling_grid_;
1640  // Start a rect search on bbox
1641  ColPartitionGridSearch rectsearch(grid);
1642  rectsearch.SetUniqueMode(true);
1643  rectsearch.StartRectSearch(bbox);
1644  ColPartition* extra_part = NULL;
1645  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1646  // ColPartition already in table
1647  const TBOX& extra_part_box = extra_part->bounding_box();
1648  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1649  continue;
1650  // Non-text ColPartitions do not contribute
1651  if (extra_part->IsImageType())
1652  continue;
1653  // Consider this partition.
1654  num_extra_partitions++;
1655  // presence of a table cell is a strong hint, so just increment the scores
1656  // without looking at the spacing.
1657  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1658  extra_space_to_right++;
1659  extra_space_to_left++;
1660  continue;
1661  }
1662  int space_threshold = kSideSpaceMargin * part.median_size();
1663  if (extra_part->space_to_right() > space_threshold)
1664  extra_space_to_right++;
1665  if (extra_part->space_to_left() > space_threshold)
1666  extra_space_to_left++;
1667  }
1668  }
1669  // tprintf("%d %d %d\n",
1670  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1671  return (extra_space_to_right > num_extra_partitions / 2) ||
1672  (extra_space_to_left > num_extra_partitions / 2);
1673 }
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
Definition: capi.h:78
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
#define NULL
Definition: host.h:144
const int kSideSpaceMargin
Definition: tablefind.cpp:105
void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX *  table_box)
protected

Definition at line 1677 of file tablefind.cpp.

1677  {
1678  // Start a search above the current table to look for column headers
1679  ColPartitionGridSearch vsearch(&clean_part_grid_);
1680  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1681  table_box->top());
1682  ColPartition* neighbor = NULL;
1683  ColPartition* previous_neighbor = NULL;
1684  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1685  // Max distance to find a table heading.
1686  const int max_distance = kMaxColumnHeaderDistance *
1687  neighbor->median_size();
1688  int table_top = table_box->top();
1689  const TBOX& box = neighbor->bounding_box();
1690  // Do not continue if the next box is way above
1691  if (box.bottom() - table_top > max_distance)
1692  break;
1693  // Unconditionally include partitions of type TABLE or LINE
1694  // TODO(faisal): add some reasonable conditions here
1695  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1696  table_box->set_top(box.top());
1697  previous_neighbor = NULL;
1698  continue;
1699  }
1700  // If there are two text partitions, one above the other, without a table
1701  // cell on their left or right side, consider them a barrier and quit
1702  if (previous_neighbor == NULL) {
1703  previous_neighbor = neighbor;
1704  } else {
1705  const TBOX& previous_box = previous_neighbor->bounding_box();
1706  if (!box.major_y_overlap(previous_box))
1707  break;
1708  }
1709  }
1710 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
Definition: capi.h:78
inT16 left() const
Definition: rect.h:68
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:88
inT16 bottom() const
Definition: rect.h:61
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
Definition: rect.h:30
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD &  bottom_left,
const ICOORD &  top_right 
)

Definition at line 191 of file tablefind.cpp.

192  {
193  // Initialize clean partitions list and grid
194  clean_part_grid_.Init(grid_size, bottom_left, top_right);
195  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
196  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
197  col_seg_grid_.Init(grid_size, bottom_left, top_right);
198  table_grid_.Init(grid_size, bottom_left, top_right);
199 }
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:447
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 592 of file tablefind.cpp.

592  {
593  FindNeighbors();
594  SetPartitionSpacings(&clean_part_grid_, all_columns);
596 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:599
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:722
void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid *  grid,
TO_BLOCK *  block 
)

Definition at line 203 of file tablefind.cpp.

204  {
205  // Calculate stats. This lets us filter partitions in AllowTextPartition()
206  // and filter blobs in AllowBlob().
207  SetGlobalSpacings(grid);
208 
209  // Iterate the ColPartitions in the grid.
210  ColPartitionGridSearch gsearch(grid);
211  gsearch.SetUniqueMode(true);
212  gsearch.StartFullSearch();
213  ColPartition* part = NULL;
214  while ((part = gsearch.NextFullSearch()) != NULL) {
215  // Reject partitions with nothing useful inside of them.
216  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
217  continue;
218  ColPartition* clean_part = part->ShallowCopy();
219  ColPartition* leader_part = NULL;
220  if (part->IsLineType()) {
221  InsertRulingPartition(clean_part);
222  continue;
223  }
224  // Insert all non-text partitions to clean_parts
225  if (!part->IsTextType()) {
226  InsertImagePartition(clean_part);
227  continue;
228  }
229  // Insert text colpartitions after removing noisy components from them
230  // The leaders are split into a separate grid.
231  BLOBNBOX_CLIST* part_boxes = part->boxes();
232  BLOBNBOX_C_IT pit(part_boxes);
233  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
234  BLOBNBOX *pblob = pit.data();
235  // Bad blobs... happens in UNLV set.
236  // news.3G1, page 17 (around x=6)
237  if (!AllowBlob(*pblob))
238  continue;
239  if (pblob->flow() == BTFT_LEADER) {
240  if (leader_part == NULL) {
241  leader_part = part->ShallowCopy();
242  leader_part->set_flow(BTFT_LEADER);
243  }
244  leader_part->AddBox(pblob);
245  } else if (pblob->region_type() != BRT_NOISE) {
246  clean_part->AddBox(pblob);
247  }
248  }
249  clean_part->ComputeLimits();
250  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
251  InsertTextPartition(clean_part);
252  SplitAndInsertFragmentedTextPartition(fragmented);
253  if (leader_part != NULL) {
254  // TODO(nbeato): Note that ComputeLimits does not update the column
255  // information. So the leader may appear to span more columns than it
256  // really does later on when IsInSameColumnAs gets called to test
257  // for adjacent leaders.
258  leader_part->ComputeLimits();
259  InsertLeaderPartition(leader_part);
260  }
261  }
262 
263  // Make the partition partners better for upper and lower neighbors.
266 }
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:431
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:449
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:434
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
BlobRegionType region_type() const
Definition: blobbox.h:268
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:515
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:722
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:423
#define NULL
Definition: host.h:144
BlobTextFlowType flow() const
Definition: blobbox.h:280
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:407
void RefinePartitionPartners(bool get_desperate)
void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 415 of file tablefind.cpp.

415  {
416  ASSERT_HOST(part != NULL);
417  if (AllowTextPartition(*part)) {
418  fragmented_text_grid_.InsertBBox(true, true, part);
419  } else {
420  delete part;
421  }
422 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:502
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
#define NULL
Definition: host.h:144
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void tesseract::TableFinder::InsertImagePartition ( ColPartition part)
protected

Definition at line 434 of file tablefind.cpp.

434  {
435  // NOTE: If images are placed into a different grid in the future,
436  // the function SetPartitionSpacings needs to be updated. It should
437  // be the only thing that cares about image partitions.
438  clean_part_grid_.InsertBBox(true, true, part);
439 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
void tesseract::TableFinder::InsertLeaderPartition ( ColPartition part)
protected

Definition at line 423 of file tablefind.cpp.

423  {
424  ASSERT_HOST(part != NULL);
425  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
426  leader_and_ruling_grid_.InsertBBox(true, true, part);
427  } else {
428  delete part;
429  }
430 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
#define NULL
Definition: host.h:144
void tesseract::TableFinder::InsertRulingPartition ( ColPartition part)
protected

Definition at line 431 of file tablefind.cpp.

431  {
432  leader_and_ruling_grid_.InsertBBox(true, true, part);
433 }
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void tesseract::TableFinder::InsertTextPartition ( ColPartition part)
protected

Definition at line 407 of file tablefind.cpp.

407  {
408  ASSERT_HOST(part != NULL);
409  if (AllowTextPartition(*part)) {
410  clean_part_grid_.InsertBBox(true, true, part);
411  } else {
412  delete part;
413  }
414 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:502
#define ASSERT_HOST(x)
Definition: errcode.h:84
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
#define NULL
Definition: host.h:144
void tesseract::TableFinder::LocateTables ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb,
const FCOORD reskew 
)

Definition at line 269 of file tablefind.cpp.

272  {
273  // initialize spacing, neighbors, and columns
274  InitializePartitions(all_columns);
275 
276 #ifndef GRAPHICS_DISABLED
277  if (textord_show_tables) {
278  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
284 
285  table_win = MakeWindow(100, 300, "Fragmented Text");
287  }
288 #endif // GRAPHICS_DISABLED
289 
290  // mark, filter, and smooth candidate table partitions
292 
293  // Make single-column blocks from good_columns_ partitions. col_segments are
294  // moved to a grid later which takes the ownership
295  ColSegment_LIST column_blocks;
296  GetColumnBlocks(all_columns, &column_blocks);
297  // Set the ratio of candidate table partitions in each column
298  SetColumnsType(&column_blocks);
299 
300  // Move column segments to col_seg_grid_
301  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
302 
303  // Detect split in column layout that might have occurred due to the
304  // presence of a table. In such a case, merge the corresponding columns.
306 
307  // Group horizontally overlapping table partitions into table columns.
308  // table_columns created here get deleted at the end of this method.
309  ColSegment_LIST table_columns;
310  GetTableColumns(&table_columns);
311 
312  // Within each column, mark the range table regions occupy based on the
313  // table columns detected. table_regions are moved to a grid later which
314  // takes the ownership
315  ColSegment_LIST table_regions;
316  GetTableRegions(&table_columns, &table_regions);
317 
318 #ifndef GRAPHICS_DISABLED
320  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
321  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
322  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
323  }
324 #endif // GRAPHICS_DISABLED
325 
326  // Merge table regions across columns for tables spanning multiple
327  // columns
328  MoveColSegmentsToGrid(&table_regions, &table_grid_);
330 
331  // Adjust table boundaries by including nearby horizontal lines and left
332  // out column headers
335 
337  // Remove false alarms consisting of a single column
339 
340 #ifndef GRAPHICS_DISABLED
341  if (textord_show_tables) {
342  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
344  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
345  table_grid_.DisplayBoxes(table_win);
346  }
347 #endif // GRAPHICS_DISABLED
348 
349  // Find table grid structure and reject tables that are malformed.
350  RecognizeTables();
352  RecognizeTables();
353 
354 #ifndef GRAPHICS_DISABLED
355  if (textord_show_tables) {
356  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
359  table_grid_.DisplayBoxes(table_win);
360  }
361 #endif // GRAPHICS_DISABLED
362  } else {
363  // Remove false alarms consisting of a single column
364  // TODO(nbeato): verify this is a NOP after structured table rejection.
365  // Right now it isn't. If the recognize function is doing what it is
366  // supposed to do, this function is obsolete.
368 
369 #ifndef GRAPHICS_DISABLED
370  if (textord_show_tables) {
371  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
374  table_grid_.DisplayBoxes(table_win);
375  }
376 #endif // GRAPHICS_DISABLED
377  }
378 
380  WriteToPix(reskew);
381 
382  // Merge all colpartitions in table regions to make them a single
383  // colpartition and revert types of isolated table cells not
384  // assigned to any table to their original types.
385  MakeTableBlocks(grid, all_columns, width_cb);
386 }
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:536
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:158
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1963
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool textord_dump_table_images
Definition: tablefind.cpp:151
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1156
bool textord_show_tables
Definition: tablefind.cpp:152
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1189
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1884
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:616
bool textord_tablefind_show_mark
Definition: tablefind.cpp:154
void WriteToPix(const FCOORD &reskew)
Definition: tablefind.cpp:2011
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:592
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:531
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1336
ColSegmentGrid table_grid_
Definition: tablefind.h:428
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:2084
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1929
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1286
void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb 
)
protected

Definition at line 2084 of file tablefind.cpp.

2086  {
2087  // Since we have table blocks already, remove table tags from all
2088  // colpartitions
2089  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2090  gsearch(grid);
2091  gsearch.StartFullSearch();
2092  ColPartition* part = NULL;
2093 
2094  while ((part = gsearch.NextFullSearch()) != NULL) {
2095  if (part->type() == PT_TABLE) {
2096  part->clear_table_type();
2097  }
2098  }
2099  // Now make a single colpartition out of each table block and remove
2100  // all colpartitions contained within a table
2101  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2102  table_search(&table_grid_);
2103  table_search.StartFullSearch();
2104  ColSegment* table;
2105  while ((table = table_search.NextFullSearch()) != NULL) {
2106  TBOX table_box = table->bounding_box();
2107  // Start a rect search on table_box
2108  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2109  rectsearch(grid);
2110  rectsearch.StartRectSearch(table_box);
2111  ColPartition* part;
2112  ColPartition* table_partition = NULL;
2113  while ((part = rectsearch.NextRectSearch()) != NULL) {
2114  // Do not consider image partitions
2115  if (!part->IsTextType())
2116  continue;
2117  TBOX part_box = part->bounding_box();
2118  // Include partition in the table if more than half of it
2119  // is covered by the table
2120  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2121  rectsearch.RemoveBBox();
2122  if (table_partition) {
2123  table_partition->Absorb(part, width_cb);
2124  } else {
2125  table_partition = part;
2126  }
2127  }
2128  }
2129  // Insert table colpartition back to part_grid_
2130  if (table_partition) {
2131  // To match the columns used when transforming to blocks, the new table
2132  // partition must have its first and last column set at the grid y that
2133  // corresponds to its bottom.
2134  const TBOX& table_box = table_partition->bounding_box();
2135  int grid_x, grid_y;
2136  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2137  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2138  table_partition->set_table_type();
2139  table_partition->set_blob_type(BRT_TEXT);
2140  table_partition->set_flow(BTFT_CHAIN);
2141  table_partition->SetBlobTypes();
2142  grid->InsertBBox(true, true, table_partition);
2143  }
2144  }
2145 }
const double kMinOverlapWithTable
Definition: tablefind.cpp:100
Definition: capi.h:78
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 531 of file tablefind.cpp.

531  {
532  return clean_part_grid_.MakeWindow(x, y, window_name);
533 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:592
void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 840 of file tablefind.cpp.

840  {
841  // Iterate the ColPartitions in the grid.
842  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
843  gsearch(&clean_part_grid_);
844  gsearch.StartFullSearch();
845  ColPartition* part = NULL;
846  while ((part = gsearch.NextFullSearch()) != NULL) {
847  if (!part->IsTextType()) // Only consider text partitions
848  continue;
849  // Only consider partitions in dominant font size or smaller
850  if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
851  continue;
852  // Mark partitions with a large gap, or no significant gap as
853  // table partitions.
854  // Comments: It produces several false alarms at:
855  // - last line of a paragraph (fixed)
856  // - single word section headings
857  // - page headers and footers
858  // - numbered equations
859  // - line drawing regions
860  // TODO(faisal): detect and fix above-mentioned cases
861  if (HasWideOrNoInterWordGap(part) ||
862  HasLeaderAdjacent(*part)) {
863  part->set_table_type();
864  }
865  }
866 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:870
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:959
#define NULL
Definition: host.h:144
const double kMaxTableCellXheight
Definition: tablefind.cpp:84
void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 802 of file tablefind.cpp.

802  {
805  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
809  }
812  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
816  }
819  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
823  }
826  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
830  }
831 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool textord_show_tables
Definition: tablefind.cpp:152
bool textord_tablefind_show_mark
Definition: tablefind.cpp:154
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:531
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1929
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:840
void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid col_seg_grid 
)
protected

Definition at line 1189 of file tablefind.cpp.

1190  {
1191  ColSegment_IT it(segments);
1192  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1193  ColSegment* seg = it.extract();
1194  col_seg_grid->InsertBBox(true, true, seg);
1195  }
1196 }
void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1830 of file tablefind.cpp.

1830  {
1831  ScrollView* table_win = NULL;
1832  if (textord_show_tables) {
1833  table_win = MakeWindow(0, 0, "Table Structure");
1836  // table_grid_.DisplayBoxes(table_win);
1837  }
1838 
1839 
1840  TableRecognizer recognizer;
1841  recognizer.Init();
1842  recognizer.set_line_grid(&leader_and_ruling_grid_);
1843  recognizer.set_text_grid(&fragmented_text_grid_);
1844  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1845  recognizer.set_min_height(1.5 * gridheight());
1846  // Loop over all of the tables and try to fit them.
1847  // Store the good tables here.
1848  ColSegment_CLIST good_tables;
1849  ColSegment_C_IT good_it(&good_tables);
1850 
1852  gsearch.StartFullSearch();
1853  ColSegment* found_table = NULL;
1854  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1855  gsearch.RemoveBBox();
1856 
1857  // The goal is to make the tables persistent in a list.
1858  // When that happens, this will move into the search loop.
1859  const TBOX& found_box = found_table->bounding_box();
1860  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1861 
1862  // Process a table. Good tables are inserted into the grid again later on
1863  // We can't change boxes in the grid while it is running a search.
1864  if (table_structure != NULL) {
1865  if (textord_show_tables) {
1866  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1867  }
1868  found_table->set_bounding_box(table_structure->bounding_box());
1869  delete table_structure;
1870  good_it.add_after_then_move(found_table);
1871  } else {
1872  delete found_table;
1873  }
1874  }
1875  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1876 
1877  // At this point, the grid is empty. We can safely insert the good tables
1878  // back into grid.
1879  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1880  table_grid_.InsertBBox(true, true, good_it.extract());
1881 }
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
int gridheight() const
Definition: tablefind.cpp:397
bool textord_show_tables
Definition: tablefind.cpp:152
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:531
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1929
#define NULL
Definition: host.h:144
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 772 of file tablefind.cpp.

772  {
774 }
void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 775 of file tablefind.cpp.

775  {
776  global_median_ledding_ = ledding;
777 }
void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 769 of file tablefind.cpp.

769  {
770  global_median_xheight_ = xheight;
771 }
void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 187 of file tablefind.cpp.

187  {
188  left_to_right_language_ = order;
189 }
void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

138  {
139  resolution_ = resolution;
140  }
void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1156 of file tablefind.cpp.

1156  {
1157  ColSegment_IT it(column_blocks);
1158  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1159  ColSegment* seg = it.data();
1160  TBOX box = seg->bounding_box();
1161  int num_table_cells = 0;
1162  int num_text_cells = 0;
1163  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1164  rsearch(&clean_part_grid_);
1165  rsearch.SetUniqueMode(true);
1166  rsearch.StartRectSearch(box);
1167  ColPartition* part = NULL;
1168  while ((part = rsearch.NextRectSearch()) != NULL) {
1169  if (part->type() == PT_TABLE) {
1170  num_table_cells++;
1171  } else if (part->type() == PT_FLOWING_TEXT) {
1172  num_text_cells++;
1173  }
1174  }
1175  // If a column block has no text or table partition in it, it is not needed
1176  // for table detection.
1177  if (!num_table_cells && !num_text_cells) {
1178  delete it.extract();
1179  } else {
1180  seg->set_num_table_cells(num_table_cells);
1181  seg->set_num_text_cells(num_text_cells);
1182  // set column type based on the ratio of table to text cells
1183  seg->set_type();
1184  }
1185  }
1186 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
Definition: capi.h:78
Definition: rect.h:30
#define NULL
Definition: host.h:144
void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid grid)
protected

Definition at line 722 of file tablefind.cpp.

722  {
723  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
724  STATS width_stats(0, kMaxBlobWidth + 1);
725  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
726  // Iterate the ColPartitions in the grid.
727  ColPartitionGridSearch gsearch(grid);
728  gsearch.SetUniqueMode(true);
729  gsearch.StartFullSearch();
730  ColPartition* part = NULL;
731  while ((part = gsearch.NextFullSearch()) != NULL) {
732  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
733  // ComputeLimits needs to get called somewhere outside of TableFinder
734  // to make sure the partitions are properly initialized.
735  // When this is called, SmoothPartitionPartners dies in an assert after
736  // table find runs. Alternative solution.
737  // part->ComputeLimits();
738  if (part->IsTextType()) {
739  // xheight_stats.add(part->median_size(), part->boxes_count());
740  // width_stats.add(part->median_width(), part->boxes_count());
741 
742  // This loop can be removed when above issues are fixed.
743  // Replace it with the 2 lines commented out above.
744  BLOBNBOX_C_IT it(part->boxes());
745  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
746  xheight_stats.add(it.data()->bounding_box().height(), 1);
747  width_stats.add(it.data()->bounding_box().width(), 1);
748  }
749 
750  ledding_stats.add(part->space_above(), 1);
751  ledding_stats.add(part->space_below(), 1);
752  }
753  }
754  // Set estimates based on median of statistics obtained
755  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
756  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
757  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
758  #ifndef GRAPHICS_DISABLED
760  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
761  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
762  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
763  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
764  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
765  }
766  #endif // GRAPHICS_DISABLED
767 }
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:775
Definition: statistc.h:33
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:769
bool textord_tablefind_show_stats
Definition: tablefind.cpp:156
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:531
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:772
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
#define NULL
Definition: host.h:144
const int kMaxBlobWidth
Definition: tablefind.cpp:43
void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid grid,
ColPartitionSet **  all_columns 
)
static protected

Definition at line 599 of file tablefind.cpp.

600  {
601  // Iterate the ColPartitions in the grid.
602  ColPartitionGridSearch gsearch(grid);
603  gsearch.StartFullSearch();
604  ColPartition* part = NULL;
605  while ((part = gsearch.NextFullSearch()) != NULL) {
606  ColPartitionSet* columns = all_columns[gsearch.GridY()];
607  TBOX box = part->bounding_box();
608  int y = part->MidY();
609  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
610  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
611  // set distance from left column as space to the left
612  if (left_column) {
613  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
614  part->set_space_to_left(left_space);
615  }
616  // set distance from right column as space to the right
617  if (right_column) {
618  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
619  part->set_space_to_right(right_space);
620  }
621 
622  // Look for images that may be closer.
623  // NOTE: used to be part_grid_, might cause issues now
624  ColPartitionGridSearch hsearch(grid);
625  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626  ColPartition* neighbor = NULL;
627  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
628  if (neighbor->type() == PT_PULLOUT_IMAGE ||
629  neighbor->type() == PT_FLOWING_IMAGE ||
630  neighbor->type() == PT_HEADING_IMAGE) {
631  int right = neighbor->bounding_box().right();
632  if (right < box.left()) {
633  int space = MIN(box.left() - right, part->space_to_left());
634  part->set_space_to_left(space);
635  }
636  }
637  }
638  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
639  neighbor = NULL;
640  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
641  if (neighbor->type() == PT_PULLOUT_IMAGE ||
642  neighbor->type() == PT_FLOWING_IMAGE ||
643  neighbor->type() == PT_HEADING_IMAGE) {
644  int left = neighbor->bounding_box().left();
645  if (left > box.right()) {
646  int space = MIN(left - box.right(), part->space_to_right());
647  part->set_space_to_right(space);
648  }
649  }
650  }
651 
652  ColPartition* upper_part = part->SingletonPartner(true);
653  if (upper_part) {
654  int space = MAX(0, upper_part->bounding_box().bottom() -
655  part->bounding_box().bottom());
656  part->set_space_above(space);
657  } else {
658  // TODO(nbeato): What constitutes a good value?
659  // 0 is the default value when not set, explicitly noting it needs to
660  // be something else.
661  part->set_space_above(MAX_INT32);
662  }
663 
664  ColPartition* lower_part = part->SingletonPartner(false);
665  if (lower_part) {
666  int space = MAX(0, part->bounding_box().bottom() -
667  lower_part->bounding_box().bottom());
668  part->set_space_below(space);
669  } else {
670  // TODO(nbeato): What constitutes a good value?
671  // 0 is the default value when not set, explicitly noting it needs to
672  // be something else.
673  part->set_space_below(MAX_INT32);
674  }
675  }
676 }
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
inT16 right() const
Definition: rect.h:75
inT16 left() const
Definition: rect.h:68
#define MAX_INT32
Definition: host.h:120
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::SetVerticalSpacing ( ColPartition part)
protected

Definition at line 679 of file tablefind.cpp.

679  {
680  TBOX box = part->bounding_box();
681  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
682  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
683  box.set_top(top_range);
684  box.set_bottom(bottom_range);
685 
686  TBOX part_box = part->bounding_box();
687  // Start a rect search
688  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
689  rectsearch(&clean_part_grid_);
690  rectsearch.StartRectSearch(box);
691  ColPartition* neighbor;
692  int min_space_above = kMaxVerticalSpacing;
693  int min_space_below = kMaxVerticalSpacing;
694  ColPartition* above_neighbor = NULL;
695  ColPartition* below_neighbor = NULL;
696  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
697  if (neighbor == part)
698  continue;
699  TBOX neighbor_box = neighbor->bounding_box();
700  if (neighbor_box.major_x_overlap(part_box)) {
701  int gap = abs(part->median_bottom() - neighbor->median_bottom());
702  // If neighbor is below current partition
703  if (neighbor_box.top() < part_box.bottom() &&
704  gap < min_space_below) {
705  min_space_below = gap;
706  below_neighbor = neighbor;
707  } // If neighbor is above current partition
708  else if (part_box.top() < neighbor_box.bottom() &&
709  gap < min_space_above) {
710  min_space_above = gap;
711  above_neighbor = neighbor;
712  }
713  }
714  }
715  part->set_space_above(min_space_above);
716  part->set_space_below(min_space_below);
717  part->set_nearest_neighbor_above(above_neighbor);
718  part->set_nearest_neighbor_below(below_neighbor);
719 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
const ICOORD & bleft() const
Definition: tablefind.cpp:400
const ICOORD & tright() const
Definition: tablefind.cpp:403
void set_bottom(int y)
Definition: rect.h:64
inT16 y() const
access_function
Definition: points.h:56
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
inT16 bottom() const
Definition: rect.h:61
const int kMaxVerticalSpacing
Definition: tablefind.cpp:41
Definition: rect.h:30
#define NULL
Definition: host.h:144
void set_top(int y)
Definition: rect.h:57
inT16 top() const
Definition: rect.h:54
void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1121 of file tablefind.cpp.

1121  {
1122  // Iterate the ColPartitions in the grid.
1124  gsearch.StartFullSearch();
1125  ColPartition* part = NULL;
1126  while ((part = gsearch.NextFullSearch()) != NULL) {
1127  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1128  continue; // Consider only text partitions
1129  ColPartition* upper_part = part->nearest_neighbor_above();
1130  ColPartition* lower_part = part->nearest_neighbor_below();
1131  if (!upper_part || !lower_part)
1132  continue;
1133  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1134  part->set_table_type();
1135  }
1136 
1137  // Pass 2, do the opposite. If both the upper and lower neighbors
1138  // exist and are not tables, this probably shouldn't be a table.
1139  gsearch.StartFullSearch();
1140  part = NULL;
1141  while ((part = gsearch.NextFullSearch()) != NULL) {
1142  if (part->type() != PT_TABLE)
1143  continue; // Consider only text partitions
1144  ColPartition* upper_part = part->nearest_neighbor_above();
1145  ColPartition* lower_part = part->nearest_neighbor_below();
1146 
1147  // table can't be by itself
1148  if ((upper_part && upper_part->type() != PT_TABLE) &&
1149  (lower_part && lower_part->type() != PT_TABLE)) {
1150  part->clear_table_type();
1151  }
1152  }
1153 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:913
Definition: capi.h:78
#define NULL
Definition: host.h:144
void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 449 of file tablefind.cpp.

449  {
450  ASSERT_HOST(part != NULL);
451  // Bye bye empty partitions!
452  if (part->boxes()->empty()) {
453  delete part;
454  return;
455  }
456 
457  // The AllowBlob function prevents this.
458  ASSERT_HOST(part->median_width() > 0);
459  const double kThreshold = part->median_width() * kSplitPartitionSize;
460 
461  ColPartition* right_part = part;
462  bool found_split = true;
463  while (found_split) {
464  found_split = false;
465  BLOBNBOX_C_IT box_it(right_part->boxes());
466  // Blobs are sorted left side first. If blobs overlap,
467  // the previous blob may have a "more right" right side.
468  // Account for this by always keeping the largest "right"
469  // so far.
470  int previous_right = MIN_INT32;
471 
472  // Look for the next split in the partition.
473  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
474  const TBOX& box = box_it.data()->bounding_box();
475  if (previous_right != MIN_INT32 &&
476  box.left() - previous_right > kThreshold) {
477  // We have a split position. Split the partition in two pieces.
478  // Insert the left piece in the grid and keep processing the right.
479  int mid_x = (box.left() + previous_right) / 2;
480  ColPartition* left_part = right_part;
481  right_part = left_part->SplitAt(mid_x);
482 
484  found_split = true;
485  break;
486  }
487 
488  // The right side of the previous blobs.
489  previous_right = MAX(previous_right, box.right());
490  }
491  }
492  // When a split is not found, the right part is minimized
493  // as much as possible, so process it.
494  InsertFragmentedTextPartition(right_part);
495 }
#define MAX(x, y)
Definition: ndminx.h:24
const double kSplitPartitionSize
Definition: tablefind.cpp:47
inT16 right() const
Definition: rect.h:75
#define MIN_INT32
Definition: host.h:128
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:415
Definition: rect.h:30
#define NULL
Definition: host.h:144
const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 403 of file tablefind.cpp.

403  {
404  return clean_part_grid_.tright();
405 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const ICOORD & tright() const
Definition: bbgrid.h:75
void tesseract::TableFinder::WriteToPix ( const FCOORD & reskew)
protected

Definition at line 2011 of file tablefind.cpp.

2011  {
2012  // Input file must be named test1.tif
2013  PIX* pix = pixRead("test1.tif");
2014  if (!pix) {
2015  tprintf("Input file test1.tif not found.\n");
2016  return;
2017  }
2018  int img_height = pixGetHeight(pix);
2019  int img_width = pixGetWidth(pix);
2020  // Maximum number of text or table partitions
2021  int num_boxes = 10;
2022  BOXA* text_box_array = boxaCreate(num_boxes);
2023  BOXA* table_box_array = boxaCreate(num_boxes);
2024  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2025  gsearch(&clean_part_grid_);
2026  gsearch.StartFullSearch();
2027  ColPartition* part;
2028  // load colpartitions into text_box_array and table_box_array
2029  while ((part = gsearch.NextFullSearch()) != NULL) {
2030  TBOX box = part->bounding_box();
2031  box.rotate_large(reskew);
2032  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2033  box.right() - box.left(),
2034  box.top() - box.bottom());
2035  if (part->type() == PT_TABLE)
2036  boxaAddBox(table_box_array, lept_box, L_INSERT);
2037  else
2038  boxaAddBox(text_box_array, lept_box, L_INSERT);
2039  }
2040  // draw colpartitions on the output image
2041  PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
2042  out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
2043 
2044  BOXA* table_array = boxaCreate(num_boxes);
2045  // text file containing detected table bounding boxes
2046  FILE* fptr = fopen("tess-table.txt", "wb");
2047  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2048  table_search(&table_grid_);
2049  table_search.StartFullSearch();
2050  ColSegment* table;
2051  // load table boxes to table_array and write them to text file as well
2052  while ((table = table_search.NextFullSearch()) != NULL) {
2053  TBOX box = table->bounding_box();
2054  box.rotate_large(reskew);
2055  // Since deskewing introduces negative coordinates, reskewing
2056  // might not completely recover from that since both steps enlarge
2057  // the actual box. Hence a box that undergoes deskewing/reskewing
2058  // may go out of image boundaries. Crop a table box if needed to
2059  // contain it inside the image dimensions.
2060  box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
2061  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2062  box.right() - box.left(),
2063  box.top() - box.bottom());
2064  boxaAddBox(table_array, lept_box, L_INSERT);
2065  fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
2066  img_height - box.top(), box.right(), img_height - box.bottom());
2067  }
2068  fclose(fptr);
2069  // paint table boxes on the debug image
2070  out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
2071 
2072  pixWrite("out.png", out, IFF_PNG);
2073  // memory cleanup
2074  boxaDestroy(&text_box_array);
2075  boxaDestroy(&table_box_array);
2076  boxaDestroy(&table_array);
2077  pixDestroy(&pix);
2078  pixDestroy(&out);
2079 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define tprintf(...)
Definition: tprintf.h:31
inT16 right() const
Definition: rect.h:75
Definition: capi.h:78
void rotate_large(const FCOORD &vec)
Definition: rect.cpp:72
inT16 left() const
Definition: rect.h:68
inT16 bottom() const
Definition: rect.h:61
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
Definition: rect.h:30
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
inT16 top() const
Definition: rect.h:54

Member Data Documentation

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 418 of file tablefind.h.

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 426 of file tablefind.h.

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 424 of file tablefind.h.

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 412 of file tablefind.h.

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 414 of file tablefind.h.

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 410 of file tablefind.h.

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 420 of file tablefind.h.

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 430 of file tablefind.h.

int tesseract::TableFinder::resolution_
protected

Definition at line 408 of file tablefind.h.

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 428 of file tablefind.h.


The documentation for this class was generated from the following files: