tesseract  4.0.0-1-g2a2b
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORD & bleft () const
 
const ICOORD & tright () const
 
ScrollView * MakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

Definition at line 160 of file tablefind.cpp.

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 168 of file tablefind.cpp.

168  {
169  // ColPartitions and ColSegments created by this class for storage in grids
170  // need to be deleted explicitly.
171  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
172  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
173  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
174  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
175  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
176 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:466
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1487 of file tablefind.cpp.

1487  {
1488  // Iterate the table regions in the grid
1489  ColSegment_CLIST adjusted_tables;
1490  ColSegment_C_IT it(&adjusted_tables);
1491  ColSegmentGridSearch gsearch(&table_grid_);
1492  gsearch.StartFullSearch();
1493  ColSegment* table = nullptr;
1494  while ((table = gsearch.NextFullSearch()) != nullptr) {
1495  const TBOX& table_box = table->bounding_box();
1496  TBOX grown_box = table_box;
1497  GrowTableBox(table_box, &grown_box);
1498  // To prevent a table from expanding again, do not insert the
1499  // modified box back to the grid. Instead move it to a list and
1500  // remove it from the grid. The list is moved later back to the grid.
1501  if (!grown_box.null_box()) {
1502  ColSegment* col = new ColSegment();
1503  col->InsertBox(grown_box);
1504  it.add_after_then_move(col);
1505  }
1506  gsearch.RemoveBBox();
1507  delete table;
1508  }
1509  // clear table grid to move final tables in it
1510  // TODO(nbeato): table_grid_ should already be empty. The above loop
1511  // removed everything. Maybe just assert it is empty?
1512  table_grid_.Clear();
1513  it.move_to_first();
1514  // move back final tables to table_grid_
1515  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1516  ColSegment* seg = it.extract();
1517  table_grid_.InsertBBox(true, true, seg);
1518  }
1519 }
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
bool null_box() const
Definition: rect.h:50
Definition: rect.h:34
void Clear()
Definition: bbgrid.h:457
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1521
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColSegmentGrid table_grid_
Definition: tablefind.h:423

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX & blob) const
protected

Definition at line 503 of file tablefind.cpp.

503  {
504  const TBOX& box = blob.bounding_box();
505  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507  const int median_area = global_median_xheight_ * global_median_blob_width_;
508  const double kAreaRequired = median_area * kAllowBlobArea;
509  // Keep comparisons strictly greater to disallow 0!
510  return box.height() > kHeightRequired &&
511  box.width() > kWidthRequired &&
512  box.area() > kAreaRequired;
513 }
Definition: rect.h:34
const double kAllowBlobArea
Definition: tablefind.cpp:58
int16_t width() const
Definition: rect.h:115
const double kAllowBlobHeight
Definition: tablefind.cpp:56
int32_t area() const
Definition: rect.h:122
const TBOX & bounding_box() const
Definition: blobbox.h:231
int16_t height() const
Definition: rect.h:108
const double kAllowBlobWidth
Definition: tablefind.cpp:57

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition & part) const
protected

Definition at line 490 of file tablefind.cpp.

490  {
491  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493  const int median_area = global_median_xheight_ * global_median_blob_width_;
494  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495  // Keep comparisons strictly greater to disallow 0!
496  return part.median_height() > kHeightRequired &&
497  part.median_width() > kWidthRequired &&
498  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
const double kAllowTextArea
Definition: tablefind.cpp:51
const double kAllowTextWidth
Definition: tablefind.cpp:50
const double kAllowTextHeight
Definition: tablefind.cpp:49

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX & box1,
const TBOX & box2 
)
protected

Definition at line 1445 of file tablefind.cpp.

1445  {
1446  // Check the obvious case. Most likely not true because overlapping boxes
1447  // should already be merged, but seems like a good thing to do in case things
1448  // change.
1449  if (box1.overlap(box2))
1450  return true;
1451  // Check for ColPartitions spanning both table regions
1452  TBOX bbox = box1.bounding_union(box2);
1453  // Start a rect search on bbox
1454  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1455  rectsearch(&clean_part_grid_);
1456  rectsearch.StartRectSearch(bbox);
1457  ColPartition* part = nullptr;
1458  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1459  const TBOX& part_box = part->bounding_box();
1460  // return true if a colpartition spanning both table regions is found
1461  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1462  !part->IsImageType())
1463  return true;
1464  }
1465  return false;
1466 }
Definition: rect.h:34
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
bool overlap(const TBOX &box) const
Definition: rect.h:355
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 388 of file tablefind.cpp.

388  {
389  return clean_part_grid_.bleft();
390 }
const ICOORD & bleft() const
Definition: bbgrid.h:73
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX & b1,
const TBOX & b2 
)
protected

Definition at line 569 of file tablefind.cpp.

569  {
570  int x_margin = 20;
571  int y_margin = 5;
572  return (abs(b1.left() - b2.left()) < x_margin) &&
573  (abs(b1.right() - b2.right()) < x_margin) &&
574  (abs(b1.top()-b2.bottom()) < y_margin ||
575  abs(b2.top()-b1.bottom()) < y_margin);
576 }
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1704 of file tablefind.cpp.

1704  {
1705  int page_width = tright().x() - bleft().x();
1706  ASSERT_HOST(page_width > 0);
1707  // create an integer array to hold projection on x-axis
1708  int* table_xprojection = new int[page_width];
1709  // Iterate through all tables in the table grid
1710  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1711  table_search(&table_grid_);
1712  table_search.StartFullSearch();
1713  ColSegment* table;
1714  while ((table = table_search.NextFullSearch()) != nullptr) {
1715  TBOX table_box = table->bounding_box();
1716  // reset the projection array
1717  for (int i = 0; i < page_width; i++) {
1718  table_xprojection[i] = 0;
1719  }
1720  // Start a rect search on table_box
1721  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1722  rectsearch(&clean_part_grid_);
1723  rectsearch.SetUniqueMode(true);
1724  rectsearch.StartRectSearch(table_box);
1725  ColPartition* part;
1726  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1727  if (!part->IsTextType())
1728  continue; // Do not consider non-text partitions
1729  if (part->flow() == BTFT_LEADER)
1730  continue; // Assume leaders are in tables
1731  TBOX part_box = part->bounding_box();
1732  // Do not consider partitions partially covered by the table
1733  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1734  continue;
1735  BLOBNBOX_CLIST* part_boxes = part->boxes();
1736  BLOBNBOX_C_IT pit(part_boxes);
1737 
1738  // Make sure overlapping blobs don't artificially inflate the number
1739  // of rows in the table. This happens frequently with things such as
1740  // decimals and split characters. Do this by assuming the column
1741  // partition is sorted mostly left to right and just clip
1742  // bounding boxes by the previous box's extent.
1743  int next_position_to_write = 0;
1744 
1745  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1746  BLOBNBOX *pblob = pit.data();
1747  // ignore blob height for the purpose of projection since we
1748  // are only interested in finding valleys
1749  int xstart = pblob->bounding_box().left();
1750  int xend = pblob->bounding_box().right();
1751 
1752  xstart = std::max(xstart, next_position_to_write);
1753  for (int i = xstart; i < xend; i++)
1754  table_xprojection[i - bleft().x()]++;
1755  next_position_to_write = xend;
1756  }
1757  }
1758  // Find largest valley between two reasonable peaks in the table
1759  if (!GapInXProjection(table_xprojection, page_width)) {
1760  table_search.RemoveBBox();
1761  delete table;
1762  }
1763  }
1764  delete[] table_xprojection;
1765 }
const ICOORD & bleft() const
Definition: tablefind.cpp:388
Definition: rect.h:34
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1769
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
const ICOORD & tright() const
Definition: tablefind.cpp:391
int16_t left() const
Definition: rect.h:72
int16_t x() const
access function
Definition: points.h:53
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
const TBOX & bounding_box() const
Definition: blobbox.h:231
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1951 of file tablefind.cpp.

1954  {
1955 #ifndef GRAPHICS_DISABLED
1956  // Iterate the ColPartitions in the grid.
1957  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1958  gsearch(grid);
1959  gsearch.StartFullSearch();
1960  ColPartition* part = nullptr;
1961  while ((part = gsearch.NextFullSearch()) != nullptr) {
1962  const TBOX& box = part->bounding_box();
1963  int left_x = box.left();
1964  int right_x = box.right();
1965  int top_y = box.top();
1966  int bottom_y = box.bottom();
1967 
1968  ColPartition* upper_part = part->nearest_neighbor_above();
1969  if (upper_part) {
1970  const TBOX& upper_box = upper_part->bounding_box();
1971  int mid_x = (left_x + right_x) / 2;
1972  int mid_y = (top_y + bottom_y) / 2;
1973  int other_x = (upper_box.left() + upper_box.right()) / 2;
1974  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1975  win->Brush(ScrollView::NONE);
1976  win->Pen(color);
1977  win->Line(mid_x, mid_y, other_x, other_y);
1978  }
1979  ColPartition* lower_part = part->nearest_neighbor_below();
1980  if (lower_part) {
1981  const TBOX& lower_box = lower_part->bounding_box();
1982  int mid_x = (left_x + right_x) / 2;
1983  int mid_y = (top_y + bottom_y) / 2;
1984  int other_x = (lower_box.left() + lower_box.right()) / 2;
1985  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1986  win->Brush(ScrollView::NONE);
1987  win->Pen(color);
1988  win->Line(mid_x, mid_y, other_x, other_y);
1989  }
1990  }
1991  win->UpdateWindow();
1992 #endif
1993 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void UpdateWindow()
Definition: scrollview.cpp:706
int16_t right() const
Definition: rect.h:79
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:534
void Brush(Color color)
Definition: scrollview.cpp:728

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1917 of file tablefind.cpp.

1920  {
1921 #ifndef GRAPHICS_DISABLED
1922  ScrollView::Color color = default_color;
1923  // Iterate the ColPartitions in the grid.
1924  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1925  gsearch(grid);
1926  gsearch.StartFullSearch();
1927  ColPartition* part = nullptr;
1928  while ((part = gsearch.NextFullSearch()) != nullptr) {
1929  color = default_color;
1930  if (part->type() == PT_TABLE)
1931  color = table_color;
1932 
1933  const TBOX& box = part->bounding_box();
1934  int left_x = box.left();
1935  int right_x = box.right();
1936  int top_y = box.top();
1937  int bottom_y = box.bottom();
1938  win->Brush(ScrollView::NONE);
1939  win->Pen(color);
1940  win->Rectangle(left_x, bottom_y, right_x, top_y);
1941  }
1942  win->UpdateWindow();
1943 #endif
1944 }
Definition: rect.h:34
Definition: capi.h:100
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void UpdateWindow()
Definition: scrollview.cpp:706
int16_t right() const
Definition: rect.h:79
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void Brush(Color color)
Definition: scrollview.cpp:728

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1945 of file tablefind.cpp.

1947  {
1948  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1949 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1917

◆ DisplayColSegmentGrid()

void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView * win,
ColSegmentGrid * grid,
ScrollView::Color  color 
)
protected

Definition at line 1892 of file tablefind.cpp.

1893  {
1894 #ifndef GRAPHICS_DISABLED
1895  // Iterate the ColPartitions in the grid.
1896  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1897  gsearch(grid);
1898  gsearch.StartFullSearch();
1899  ColSegment* seg = nullptr;
1900  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1901  const TBOX& box = seg->bounding_box();
1902  int left_x = box.left();
1903  int right_x = box.right();
1904  int top_y = box.top();
1905  int bottom_y = box.bottom();
1906  win->Brush(ScrollView::NONE);
1907  win->Pen(color);
1908  win->Rectangle(left_x, bottom_y, right_x, top_y);
1909  }
1910  win->UpdateWindow();
1911 #endif
1912 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void UpdateWindow()
Definition: scrollview.cpp:706
int16_t right() const
Definition: rect.h:79
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void Brush(Color color)
Definition: scrollview.cpp:728

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView * win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1872 of file tablefind.cpp.

1874  {
1875 #ifndef GRAPHICS_DISABLED
1876  win->Pen(color);
1877  win->Brush(ScrollView::NONE);
1878  ColSegment_IT it(segments);
1879  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1880  ColSegment* col = it.data();
1881  const TBOX& box = col->bounding_box();
1882  int left_x = box.left();
1883  int right_x = box.right();
1884  int top_y = box.top();
1885  int bottom_y = box.bottom();
1886  win->Rectangle(left_x, bottom_y, right_x, top_y);
1887  }
1888  win->UpdateWindow();
1889 #endif
1890 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void UpdateWindow()
Definition: scrollview.cpp:706
int16_t right() const
Definition: rect.h:79
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:602
void Pen(Color color)
Definition: scrollview.cpp:722
int16_t bottom() const
Definition: rect.h:65
void Brush(Color color)
Definition: scrollview.cpp:728

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 989 of file tablefind.cpp.

989  {
990  FilterParagraphEndings();
991  FilterHeaderAndFooter();
992  // TODO(nbeato): Fully justified text as non-table?
993 }

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1075 of file tablefind.cpp.

1075  {
1076  // Consider top-most text colpartition as header and bottom most as footer
1077  ColPartition* header = nullptr;
1078  ColPartition* footer = nullptr;
1079  int max_top = INT32_MIN;
1080  int min_bottom = INT32_MAX;
1081  ColPartitionGridSearch gsearch(&clean_part_grid_);
1082  gsearch.StartFullSearch();
1083  ColPartition* part = nullptr;
1084  while ((part = gsearch.NextFullSearch()) != nullptr) {
1085  if (!part->IsTextType())
1086  continue; // Consider only text partitions
1087  int top = part->bounding_box().top();
1088  int bottom = part->bounding_box().bottom();
1089  if (top > max_top) {
1090  max_top = top;
1091  header = part;
1092  }
1093  if (bottom < min_bottom) {
1094  min_bottom = bottom;
1095  footer = part;
1096  }
1097  }
1098  if (header)
1099  header->clear_table_type();
1100  if (footer)
1101  footer->clear_table_type();
1102 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 995 of file tablefind.cpp.

995  {
996  // Detect last line of paragraph
997  // Iterate the ColPartitions in the grid.
998  ColPartitionGridSearch gsearch(&clean_part_grid_);
999  gsearch.StartFullSearch();
1000  ColPartition* part = nullptr;
1001  while ((part = gsearch.NextFullSearch()) != nullptr) {
1002  if (part->type() != PT_TABLE)
1003  continue; // Consider only table partitions
1004 
1005  // Paragraph ending should have flowing text above it.
1006  ColPartition* upper_part = part->nearest_neighbor_above();
1007  if (!upper_part)
1008  continue;
1009  if (upper_part->type() != PT_FLOWING_TEXT)
1010  continue;
1011  if (upper_part->bounding_box().width() <
1012  2 * part->bounding_box().width())
1013  continue;
1014  // Check if it's the last line of a paragraph.
1015  // In most cases, a paragraph ending should be left-aligned to text line
1016  // above it. Sometimes, it could be a 2 line paragraph, in which case
1017  // the line above it is indented.
1018  // To account for that, check if the partition center is to
1019  // the left of the one above it.
1020  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1021  int upper_mid = (upper_part->bounding_box().left() +
1022  upper_part->bounding_box().right()) / 2;
1023  int current_spacing = 0; // spacing of the current line to margin
1024  int upper_spacing = 0; // spacing of the previous line to the margin
1025  if (left_to_right_language_) {
1026  // Left to right languages, use mid - left to figure out the distance
1027  // the middle is from the left margin.
1028  int left = std::min(part->bounding_box().left(),
1029  upper_part->bounding_box().left());
1030  current_spacing = mid - left;
1031  upper_spacing = upper_mid - left;
1032  } else {
1033  // Right to left languages, use right - mid to figure out the distance
1034  // the middle is from the right margin.
1035  int right = std::max(part->bounding_box().right(),
1036  upper_part->bounding_box().right());
1037  current_spacing = right - mid;
1038  upper_spacing = right - upper_mid;
1039  }
1040  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1041  continue;
1042 
1043  // Paragraphs should have similar fonts.
1044  if (!part->MatchingSizes(*upper_part) ||
1045  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1046  kStrokeWidthConstantTolerance)) {
1047  continue;
1048  }
1049 
1050  // The last line of a paragraph should be left aligned.
1051  // TODO(nbeato): This would be untrue if the text was right aligned.
1052  // How often is that?
1053  if (part->space_to_left() >
1054  kMaxParagraphEndingLeftSpaceMultiple * part->median_height())
1055  continue;
1056  // The line above it should be right aligned (assuming justified format).
1057  // Since we can't assume justified text, we compare whitespace to text.
1058  // The above line should have majority spanning text (or the current
1059  // line could have fit on the previous line). So compare
1060  // whitespace to text.
1061  if (upper_part->bounding_box().width() <
1062  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1063  continue;
1064 
1065  // Ledding above the line should be less than ledding below
1066  if (part->space_above() >= part->space_below() ||
1067  part->space_above() > 2 * global_median_ledding_)
1068  continue;
1069 
1070  // If all checks failed, it is probably text.
1071  part->clear_table_type();
1072  }
1073 }
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:122
Definition: capi.h:100
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:132
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:126
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:141
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:140

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 767 of file tablefind.cpp.

767  {
768  ColPartitionGridSearch gsearch(&clean_part_grid_);
769  gsearch.StartFullSearch();
770  ColPartition* part = nullptr;
771  while ((part = gsearch.NextFullSearch()) != nullptr) {
772  // TODO(nbeato): Rename this function, meaning is different now.
773  // IT is finding nearest neighbors its own way
774  //SetVerticalSpacing(part);
775 
776  ColPartition* upper = part->SingletonPartner(true);
777  if (upper)
778  part->set_nearest_neighbor_above(upper);
779 
780  ColPartition* lower = part->SingletonPartner(false);
781  if (lower)
782  part->set_nearest_neighbor_below(lower);
783  }
784 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1769 of file tablefind.cpp.

1769  {
1770  // Find peak value of the histogram
1771  int peak_value = 0;
1772  for (int i = 0; i < length; i++) {
1773  if (xprojection[i] > peak_value) {
1774  peak_value = xprojection[i];
1775  }
1776  }
1777  // Peak value represents the maximum number of horizontally
1778  // overlapping colpartitions, so this can be considered as the
1779  // number of rows in the table
1780  if (peak_value < kMinRowsInTable)
1781  return false;
1782  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1783  if (peak_value >= kLargeTableRowCount)
1784  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1785  // Threshold the histogram
1786  for (int i = 0; i < length; i++) {
1787  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1788  }
1789  // Find the largest run of zeros between two ones
1790  int largest_gap = 0;
1791  int run_start = -1;
1792  for (int i = 1; i < length; i++) {
1793  // detect start of a run of zeros
1794  if (xprojection[i - 1] && !xprojection[i]) {
1795  run_start = i;
1796  }
1797  // detect end of a run of zeros and update the value of largest gap
1798  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1799  int gap = i - run_start;
1800  if (gap > largest_gap)
1801  largest_gap = gap;
1802  run_start = -1;
1803  }
1804  }
1805  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1806 }
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:107
const int kMinRowsInTable
Definition: tablefind.cpp:112
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:106
const int kLargeTableRowCount
Definition: tablefind.cpp:109
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:136

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 524 of file tablefind.cpp.

525  {
526  for (int i = 0; i < gridheight(); ++i) {
527  ColPartitionSet* columns = all_columns[i];
528  if (columns != nullptr) {
529  ColSegment_LIST new_blocks;
530  // Get boxes from the current vertical position on the grid
531  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
532  // Merge the new_blocks boxes into column_blocks if they are well-aligned
533  GroupColumnBlocks(&new_blocks, column_blocks);
534  }
535  }
536 }
int gridheight() const
Definition: tablefind.cpp:385
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:539

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1274 of file tablefind.cpp.

1274  {
1275  ColSegment_IT it(table_columns);
1276  // Iterate the ColPartitions in the grid.
1277  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1278  gsearch(&clean_part_grid_);
1279  gsearch.StartFullSearch();
1280  ColPartition* part;
1281  while ((part = gsearch.NextFullSearch()) != nullptr) {
1282  if (part->inside_table_column() || part->type() != PT_TABLE)
1283  continue; // prevent a partition to be assigned to multiple columns
1284  const TBOX& box = part->bounding_box();
1285  ColSegment* col = new ColSegment();
1286  col->InsertBox(box);
1287  part->set_inside_table_column(true);
1288  // Start a search below the current cell to find bottom neighbours
1289  // Note: a full search will always process things above it first, so
1290  // this should be starting at the highest cell and working its way down.
1291  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1292  vsearch(&clean_part_grid_);
1293  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1294  ColPartition* neighbor = nullptr;
1295  bool found_neighbours = false;
1296  while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1297  // only consider neighbors not assigned to any column yet
1298  if (neighbor->inside_table_column())
1299  continue;
1300  // Horizontal lines should not break the flow
1301  if (neighbor->IsHorizontalLine())
1302  continue;
1303  // presence of a non-table neighbor marks the end of current
1304  // table column
1305  if (neighbor->type() != PT_TABLE)
1306  break;
1307  // add the neighbor partition to the table column
1308  const TBOX& neighbor_box = neighbor->bounding_box();
1309  col->InsertBox(neighbor_box);
1310  neighbor->set_inside_table_column(true);
1311  found_neighbours = true;
1312  }
1313  if (found_neighbours) {
1314  it.add_after_then_move(col);
1315  } else {
1316  part->set_inside_table_column(false);
1317  delete col;
1318  }
1319  }
1320 }
Definition: rect.h:34
Definition: capi.h:100
int16_t left() const
Definition: rect.h:72
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1324 of file tablefind.cpp.

1325  {
1326  ColSegment_IT cit(table_columns);
1327  ColSegment_IT rit(table_regions);
1328  // Iterate through column blocks
1329  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1330  gsearch(&col_seg_grid_);
1331  gsearch.StartFullSearch();
1332  ColSegment* part;
1333  int page_height = tright().y() - bleft().y();
1334  ASSERT_HOST(page_height > 0);
1335  // create a bool array to hold projection on y-axis
1336  bool* table_region = new bool[page_height];
1337  while ((part = gsearch.NextFullSearch()) != nullptr) {
1338  const TBOX& part_box = part->bounding_box();
1339  // reset the projection array
1340  for (int i = 0; i < page_height; i++) {
1341  table_region[i] = false;
1342  }
1343  // iterate through all table columns to find regions in the current
1344  // page column block
1345  cit.move_to_first();
1346  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1347  TBOX col_box = cit.data()->bounding_box();
1348  // find intersection region of table column and page column
1349  TBOX intersection_box = col_box.intersection(part_box);
1350  // project table column on the y-axis
1351  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1352  table_region[i - bleft().y()] = true;
1353  }
1354  }
1355  // set x-limits of table regions to page column width
1356  TBOX current_table_box;
1357  current_table_box.set_left(part_box.left());
1358  current_table_box.set_right(part_box.right());
1359  // go through the y-axis projection to find runs of table
1360  // regions. Each run makes one table region.
1361  for (int i = 1; i < page_height; i++) {
1362  // detect start of a table region
1363  if (!table_region[i - 1] && table_region[i]) {
1364  current_table_box.set_bottom(i + bleft().y());
1365  }
1366  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1367  // detect end of a table region
1368  if (table_region[i - 1] && !table_region[i]) {
1369  current_table_box.set_top(i + bleft().y());
1370  if (!current_table_box.null_box()) {
1371  ColSegment* seg = new ColSegment();
1372  seg->InsertBox(current_table_box);
1373  rit.add_after_then_move(seg);
1374  }
1375  }
1376  }
1377  }
1378  delete[] table_region;
1379 }
const ICOORD & bleft() const
Definition: tablefind.cpp:388
void set_top(int y)
Definition: rect.h:61
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool null_box() const
Definition: rect.h:50
void set_bottom(int y)
Definition: rect.h:68
int16_t y() const
access_function
Definition: points.h:57
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
const ICOORD & tright() const
Definition: tablefind.cpp:391
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
void set_left(int x)
Definition: rect.h:75
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 385 of file tablefind.cpp.

385  {
386  return clean_part_grid_.gridheight();
387 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
int gridheight() const
Definition: bbgrid.h:70

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1196 of file tablefind.cpp.

1196  {
1197  int margin = gridsize();
1198 
1199  // Iterate the Column Blocks in the grid.
1200  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1201  gsearch(&col_seg_grid_);
1202  gsearch.StartFullSearch();
1203  ColSegment* seg;
1204  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1205  if (seg->type() != COL_TEXT)
1206  continue; // only consider text blocks for split detection
1207  bool neighbor_found = false;
1208  bool modified = false; // Modified at least once
1209  // keep expanding current box as long as neighboring table columns
1210  // are found above or below it.
1211  do {
1212  TBOX box = seg->bounding_box();
1213  // slightly expand the search region vertically
1214  int top_range = std::min(box.top() + margin, static_cast<int>(tright().y()));
1215  int bottom_range = std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1216  box.set_top(top_range);
1217  box.set_bottom(bottom_range);
1218  neighbor_found = false;
1219  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1220  rectsearch(&col_seg_grid_);
1221  rectsearch.StartRectSearch(box);
1222  ColSegment* neighbor = nullptr;
1223  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1224  if (neighbor == seg)
1225  continue;
1226  const TBOX& neighbor_box = neighbor->bounding_box();
1227  // If the neighbor box significantly overlaps with the current
1228  // box (due to the expansion of the current box in the
1229  // previous iteration of this loop), remove the neighbor box
1230  // and expand the current box to include it.
1231  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1232  seg->InsertBox(neighbor_box);
1233  modified = true;
1234  rectsearch.RemoveBBox();
1235  gsearch.RepositionIterator();
1236  delete neighbor;
1237  continue;
1238  }
1239  // Only expand if the neighbor box is of table type
1240  if (neighbor->type() != COL_TABLE)
1241  continue;
1242  // Insert the neighbor box into the current column block
1243  if (neighbor_box.major_x_overlap(box) &&
1244  !box.contains(neighbor_box)) {
1245  seg->InsertBox(neighbor_box);
1246  neighbor_found = true;
1247  modified = true;
1248  rectsearch.RemoveBBox();
1249  gsearch.RepositionIterator();
1250  delete neighbor;
1251  }
1252  }
1253  } while (neighbor_found);
1254  if (modified) {
1255  // Because the box has changed, it has to be removed first.
1256  gsearch.RemoveBBox();
1257  col_seg_grid_.InsertBBox(true, true, seg);
1258  gsearch.RepositionIterator();
1259  }
1260  }
1261 }
const ICOORD & bleft() const
Definition: tablefind.cpp:388
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
int16_t y() const
access_function
Definition: points.h:57
Definition: rect.h:34
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
const ICOORD & tright() const
Definition: tablefind.cpp:391
int16_t top() const
Definition: rect.h:58
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
bool contains(const FCOORD pt) const
Definition: rect.h:333
int16_t bottom() const
Definition: rect.h:65

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1387 of file tablefind.cpp.

1387  {
1388  // Iterate the table regions in the grid.
1389  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1390  gsearch(&table_grid_);
1391  gsearch.StartFullSearch();
1392  ColSegment* seg = nullptr;
1393  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1394  bool neighbor_found = false;
1395  bool modified = false; // Modified at least once
1396  do {
1397  // Start a rectangle search x-bounded by the image and y by the table
1398  const TBOX& box = seg->bounding_box();
1399  TBOX search_region(box);
1400  search_region.set_left(bleft().x());
1401  search_region.set_right(tright().x());
1402  neighbor_found = false;
1403  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1404  rectsearch(&table_grid_);
1405  rectsearch.StartRectSearch(search_region);
1406  ColSegment* neighbor = nullptr;
1407  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1408  if (neighbor == seg)
1409  continue;
1410  const TBOX& neighbor_box = neighbor->bounding_box();
1411  // Check if a neighbor box has a large overlap with the table
1412  // region. This may happen as a result of merging two table
1413  // regions in the previous iteration.
1414  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1415  seg->InsertBox(neighbor_box);
1416  rectsearch.RemoveBBox();
1417  gsearch.RepositionIterator();
1418  delete neighbor;
1419  modified = true;
1420  continue;
1421  }
1422  // Check if two table regions belong together based on a common
1423  // horizontal ruling line
1424  if (BelongToOneTable(box, neighbor_box)) {
1425  seg->InsertBox(neighbor_box);
1426  neighbor_found = true;
1427  modified = true;
1428  rectsearch.RemoveBBox();
1429  gsearch.RepositionIterator();
1430  delete neighbor;
1431  }
1432  }
1433  } while (neighbor_found);
1434  if (modified) {
1435  // Because the box has changed, it has to be removed first.
1436  gsearch.RemoveBBox();
1437  table_grid_.InsertBBox(true, true, seg);
1438  gsearch.RepositionIterator();
1439  }
1440  }
1441 }
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1445
const ICOORD & bleft() const
Definition: tablefind.cpp:388
Definition: rect.h:34
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
const ICOORD & tright() const
Definition: tablefind.cpp:391
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColSegmentGrid table_grid_
Definition: tablefind.h:423

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 379 of file tablefind.cpp.

379  {
380  return clean_part_grid_.gridsize();
381 }
int gridsize() const
Definition: bbgrid.h:64
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 382 of file tablefind.cpp.

382  {
383  return clean_part_grid_.gridwidth();
384 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
int gridwidth() const
Definition: bbgrid.h:67

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 539 of file tablefind.cpp.

540  {
541  ColSegment_IT src_it(new_blocks);
542  ColSegment_IT dest_it(column_blocks);
543  // iterate through the source list
544  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
545  ColSegment* src_seg = src_it.data();
546  const TBOX& src_box = src_seg->bounding_box();
547  bool match_found = false;
548  // iterate through the destination list to find a matching column block
549  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
550  ColSegment* dest_seg = dest_it.data();
551  TBOX dest_box = dest_seg->bounding_box();
552  if (ConsecutiveBoxes(src_box, dest_box)) {
553  // If matching block is found, insert the current block into it
554  // and delete the source block.
555  dest_seg->InsertBox(src_box);
556  match_found = true;
557  delete src_it.extract();
558  break;
559  }
560  }
561  // If no match is found, just append the source block to column_blocks
562  if (!match_found) {
563  dest_it.add_after_then_move(src_it.extract());
564  }
565  }
566 }
Definition: rect.h:34
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:569

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX & table_box,
TBOX * result_box 
)
protected

Definition at line 1521 of file tablefind.cpp.

1521  {
1522  // TODO(nbeato): The growing code is a bit excessive right now.
1523  // By removing these lines, the partitions considered need
1524  // to have some overlap or be special cases. These lines could
1525  // be added again once a check is put in place to make sure that
1526  // growing tables don't stomp on a lot of non-table partitions.
1527 
1528  // search for horizontal ruling lines within the vertical margin
1529  // int vertical_margin = kRulingVerticalMargin * gridsize();
1530  TBOX search_box = table_box;
1531  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1532  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1533  // search_box.set_top(top);
1534  // search_box.set_bottom(bottom);
1535 
1536  GrowTableToIncludePartials(table_box, search_box, result_box);
1537  GrowTableToIncludeLines(table_box, search_box, result_box);
1538  IncludeLeftOutColumnHeaders(result_box);
1539 }
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1571
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1543
Definition: rect.h:34
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1665

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box 
)
protected

Definition at line 1571 of file tablefind.cpp.

1573  {
1574  ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1575  rsearch.SetUniqueMode(true);
1576  rsearch.StartRectSearch(search_range);
1577  ColPartition* part = nullptr;
1578  while ((part = rsearch.NextRectSearch()) != nullptr) {
1579  // TODO(nbeato) This should also do vertical, but column
1580  // boundaries are breaking things. This function needs to be
1581  // updated to allow vertical lines as well.
1582  if (!part->IsLineType())
1583  continue;
1584  // Avoid the following function call if the result of the
1585  // function is irrelevant.
1586  const TBOX& part_box = part->bounding_box();
1587  if (result_box->contains(part_box))
1588  continue;
1589  // Include a partially overlapping horizontal line only if the
1590  // extra ColPartitions that will be included due to expansion
1591  // have large side spacing w.r.t. columns containing them.
1592  if (HLineBelongsToTable(*part, table_box))
1593  *result_box = result_box->bounding_union(part_box);
1594  // TODO(nbeato): Vertical
1595  }
1596 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: rect.h:34
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
bool contains(const FCOORD pt) const
Definition: rect.h:333
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1601

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box 
)
protected

Definition at line 1543 of file tablefind.cpp.

1545  {
1546  // Rulings are in a different grid, so search 2 grids for rulings, text,
1547  // and table partitions that are not entirely within the new box.
1548  for (int i = 0; i < 2; ++i) {
1549  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1550  &leader_and_ruling_grid_;
1551  ColPartitionGridSearch rectsearch(grid);
1552  rectsearch.StartRectSearch(search_range);
1553  ColPartition* part = nullptr;
1554  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1555  // Only include text and table types.
1556  if (part->IsImageType())
1557  continue;
1558  const TBOX& part_box = part->bounding_box();
1559  // Include partition in the table if more than half of it
1560  // is covered by the table
1561  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1562  *result_box = result_box->bounding_union(part_box);
1563  continue;
1564  }
1565  }
1566  }
1567 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: rect.h:34
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition & part)
protected

Definition at line 947 of file tablefind.cpp.

947  {
948  if (part.flow() == BTFT_LEADER)
949  return true;
950  // Search range is left and right bounded by an offset of the
951  // median xheight. This offset is to allow some tolerance to the
952  // the leaders on the page in the event that the alignment is still
953  // a bit off.
954  const TBOX& box = part.bounding_box();
955  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
956  const int top = box.top() + search_size;
957  const int bottom = box.bottom() - search_size;
958  ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
959  for (int direction = 0; direction < 2; ++direction) {
960  bool right_to_left = (direction == 0);
961  int x = right_to_left ? box.right() : box.left();
962  hsearch.StartSideSearch(x, bottom, top);
963  ColPartition* leader = nullptr;
964  while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
965  // The leader could be a horizontal ruling in the grid.
966  // Make sure it is actually a leader.
967  if (leader->flow() != BTFT_LEADER)
968  continue;
969  // This should not happen, they are in different grids.
970  ASSERT_HOST(&part != leader);
971  // Make sure the leader shares a page column with the partition,
972  // otherwise we are spreading across columns.
973  if (!part.IsInSameColumnAs(*leader))
974  break;
975  // There should be a significant vertical overlap
976  if (!leader->VSignificantCoreOverlap(part))
977  continue;
978  // Leader passed all tests, so it is adjacent.
979  return true;
980  }
981  }
982  // No leaders are adjacent to the given partition.
983  return false;
984 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: rect.h:34
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:117
int16_t bottom() const
Definition: rect.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition * part) const
protected

Definition at line 858 of file tablefind.cpp.

858  {
859  // Should only get text partitions.
860  ASSERT_HOST(part->IsTextType());
861  // Blob access
862  BLOBNBOX_CLIST* part_boxes = part->boxes();
863  BLOBNBOX_C_IT it(part_boxes);
864  // Check if this is a relatively small partition (such as a single word)
865  if (part->bounding_box().width() <
866  kMinBoxesInTextPartition * part->median_height() &&
867  part_boxes->length() < kMinBoxesInTextPartition)
868  return true;
869 
870  // Variables used to compute inter-blob spacing.
871  int current_x0 = -1;
872  int current_x1 = -1;
873  int previous_x1 = -1;
874  // Stores the maximum gap detected.
875  int largest_partition_gap_found = -1;
876  // Text partition gap limits. If this is text (and not a table),
877  // there should be at least one gap larger than min_gap and no gap
878  // larger than max_gap.
879  const double max_gap = kMaxGapInTextPartition * part->median_height();
880  const double min_gap = kMinMaxGapInTextPartition * part->median_height();
881 
882  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
883  BLOBNBOX* blob = it.data();
884  current_x0 = blob->bounding_box().left();
885  current_x1 = blob->bounding_box().right();
886  if (previous_x1 != -1) {
887  int gap = current_x0 - previous_x1;
888 
889  // TODO(nbeato): Boxes may overlap? Huh?
890  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
891  // on the top right of the page are filtered out with this line.
892  // Note 2: Iterating over blobs in a partition, so we are looking for
893  // spacing between the words.
894  if (gap < 0) {
895  // More likely case, the blobs slightly overlap. This can happen
896  // with diacritics (accents) or broken alphabet symbols (characters).
897  // Merge boxes together by taking max of right sides.
898  if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
899  previous_x1 = std::max(previous_x1, current_x1);
900  continue;
901  }
902  // Extreme case, blobs overlap significantly in the same partition...
903  // This should not happen often (if at all), but it does.
904  // TODO(nbeato): investigate cases when this happens.
905  else {
906  // The behavior before was to completely ignore this case.
907  }
908  }
909 
910  // If a large enough gap is found, mark it as a table cell (return true)
911  if (gap > max_gap)
912  return true;
913  if (gap > largest_partition_gap_found)
914  largest_partition_gap_found = gap;
915  }
916  previous_x1 = current_x1;
917  }
918  // Since no large gap was found, return false if the partition is too
919  // long to be a data cell
920  if (part->bounding_box().width() >
921  kMaxBoxesInDataPartition * part->median_height() ||
922  part_boxes->length() > kMaxBoxesInDataPartition)
923  return false;
924 
925  // A partition may be a single blob. In this case, it's an isolated symbol
926  // or non-text (such as a ruling or image).
927  // Detect these as table partitions? Shouldn't this be case by case?
928  // The behavior before was to ignore this, making max_partition_gap < 0
929  // and implicitly return true. Just making it explicit.
930  if (largest_partition_gap_found == -1)
931  return true;
932 
933  // return true if the maximum gap found is smaller than the minimum allowed
934  // max_gap in a text partition. This indicates that there is no significant
935  // space in the partition, hence it is likely a single word.
936  return largest_partition_gap_found < min_gap;
937 }
const double kMaxGapInTextPartition
Definition: tablefind.cpp:69
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:66
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:73
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:77
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:63
int16_t left() const
Definition: rect.h:72
const TBOX & bounding_box() const
Definition: blobbox.h:231
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition & part,
const TBOX & table_box 
)
protected

Definition at line 1601 of file tablefind.cpp.

1602  {
1603  if (!part.IsHorizontalLine())
1604  return false;
1605  const TBOX& part_box = part.bounding_box();
1606  if (!part_box.major_x_overlap(table_box))
1607  return false;
1608  // Do not consider top-most horizontal line since it usually
1609  // originates from noise.
1610  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1611  // have neighbors solved.
1612  // if (!part.nearest_neighbor_above())
1613  // return false;
1614  const TBOX bbox = part_box.bounding_union(table_box);
1615  // In the "unioned table" box (the table extents expanded by the line),
1616  // keep track of how many partitions have significant padding to the left
1617  // and right. If more than half of the partitions covered by the new table
1618  // have significant spacing, the line belongs to the table and the table
1619  // grows to include all of the partitions.
1620  int num_extra_partitions = 0;
1621  int extra_space_to_right = 0;
1622  int extra_space_to_left = 0;
1623  // Rulings are in a different grid, so search 2 grids for rulings, text,
1624  // and table partitions that are introduced by the new box.
1625  for (int i = 0; i < 2; ++i) {
1626  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1627  &leader_and_ruling_grid_;
1628  // Start a rect search on bbox
1629  ColPartitionGridSearch rectsearch(grid);
1630  rectsearch.SetUniqueMode(true);
1631  rectsearch.StartRectSearch(bbox);
1632  ColPartition* extra_part = nullptr;
1633  while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1634  // ColPartition already in table
1635  const TBOX& extra_part_box = extra_part->bounding_box();
1636  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1637  continue;
1638  // Non-text ColPartitions do not contribute
1639  if (extra_part->IsImageType())
1640  continue;
1641  // Consider this partition.
1642  num_extra_partitions++;
1643  // presence of a table cell is a strong hint, so just increment the scores
1644  // without looking at the spacing.
1645  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1646  extra_space_to_right++;
1647  extra_space_to_left++;
1648  continue;
1649  }
1650  int space_threshold = kSideSpaceMargin * part.median_height();
1651  if (extra_part->space_to_right() > space_threshold)
1652  extra_space_to_right++;
1653  if (extra_part->space_to_left() > space_threshold)
1654  extra_space_to_left++;
1655  }
1656  }
1657  // tprintf("%d %d %d\n",
1658  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1659  return (extra_space_to_right > num_extra_partitions / 2) ||
1660  (extra_space_to_left > num_extra_partitions / 2);
1661 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: rect.h:34
Definition: capi.h:100
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
const int kSideSpaceMargin
Definition: tablefind.cpp:102
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX * table_box)
protected

Definition at line 1665 of file tablefind.cpp.

1665  {
1666  // Start a search above the current table to look for column headers
1667  ColPartitionGridSearch vsearch(&clean_part_grid_);
1668  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1669  table_box->top());
1670  ColPartition* neighbor = nullptr;
1671  ColPartition* previous_neighbor = nullptr;
1672  while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1673  // Max distance to find a table heading.
1674  const int max_distance = kMaxColumnHeaderDistance *
1675  neighbor->median_height();
1676  int table_top = table_box->top();
1677  const TBOX& box = neighbor->bounding_box();
1678  // Do not continue if the next box is way above
1679  if (box.bottom() - table_top > max_distance)
1680  break;
1681  // Unconditionally include partitions of type TABLE or LINE
1682  // TODO(faisal): add some reasonable conditions here
1683  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1684  table_box->set_top(box.top());
1685  previous_neighbor = nullptr;
1686  continue;
1687  }
1688  // If there are two text partitions, one above the other, without a table
1689  // cell on their left or right side, consider them a barrier and quit
1690  if (previous_neighbor == nullptr) {
1691  previous_neighbor = neighbor;
1692  } else {
1693  const TBOX& previous_box = previous_neighbor->bounding_box();
1694  if (!box.major_y_overlap(previous_box))
1695  break;
1696  }
1697  }
1698 }
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
void set_top(int y)
Definition: rect.h:61
Definition: rect.h:34
Definition: capi.h:100
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:85
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD & bottom_left,
const ICOORD & top_right 
)

Definition at line 182 of file tablefind.cpp.

183  {
184  // Initialize clean partitions list and grid
185  clean_part_grid_.Init(grid_size, bottom_left, top_right);
186  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
187  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
188  col_seg_grid_.Init(grid_size, bottom_left, top_right);
189  table_grid_.Init(grid_size, bottom_left, top_right);
190 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:447
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 580 of file tablefind.cpp.

580  {
581  FindNeighbors();
582  SetPartitionSpacings(&clean_part_grid_, all_columns);
583  SetGlobalSpacings(&clean_part_grid_);
584 }
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:710
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:587
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid * grid,
TO_BLOCK * block 
)

Definition at line 194 of file tablefind.cpp.

195  {
196  // Calculate stats. This lets us filter partitions in AllowTextPartition()
197  // and filter blobs in AllowBlob().
198  SetGlobalSpacings(grid);
199 
200  // Iterate the ColPartitions in the grid.
201  ColPartitionGridSearch gsearch(grid);
202  gsearch.SetUniqueMode(true);
203  gsearch.StartFullSearch();
204  ColPartition* part = nullptr;
205  while ((part = gsearch.NextFullSearch()) != nullptr) {
206  // Reject partitions with nothing useful inside of them.
207  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
208  continue;
209  ColPartition* clean_part = part->ShallowCopy();
210  ColPartition* leader_part = nullptr;
211  if (part->IsLineType()) {
212  InsertRulingPartition(clean_part);
213  continue;
214  }
215  // Insert all non-text partitions to clean_parts
216  if (!part->IsTextType()) {
217  InsertImagePartition(clean_part);
218  continue;
219  }
220  // Insert text colpartitions after removing noisy components from them
221  // The leaders are split into a separate grid.
222  BLOBNBOX_CLIST* part_boxes = part->boxes();
223  BLOBNBOX_C_IT pit(part_boxes);
224  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225  BLOBNBOX *pblob = pit.data();
226  // Bad blobs... happens in UNLV set.
227  // news.3G1, page 17 (around x=6)
228  if (!AllowBlob(*pblob))
229  continue;
230  if (pblob->flow() == BTFT_LEADER) {
231  if (leader_part == nullptr) {
232  leader_part = part->ShallowCopy();
233  leader_part->set_flow(BTFT_LEADER);
234  }
235  leader_part->AddBox(pblob);
236  } else if (pblob->region_type() != BRT_NOISE) {
237  clean_part->AddBox(pblob);
238  }
239  }
240  clean_part->ComputeLimits();
241  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
242  InsertTextPartition(clean_part);
243  SplitAndInsertFragmentedTextPartition(fragmented);
244  if (leader_part != nullptr) {
245  // TODO(nbeato): Note that ComputeLimits does not update the column
246  // information. So the leader may appear to span more columns than it
247  // really does later on when IsInSameColumnAs gets called to test
248  // for adjacent leaders.
249  leader_part->ComputeLimits();
250  InsertLeaderPartition(leader_part);
251  }
252  }
253 
254  // Make the partition partners better for upper and lower neighbors.
257 }
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:437
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:710
BlobTextFlowType flow() const
Definition: blobbox.h:296
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:503
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:395
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:419
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:411
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
BlobRegionType region_type() const
Definition: blobbox.h:284
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:299
void RefinePartitionPartners(bool get_desperate)
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:422

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition * part)
protected

Definition at line 403 of file tablefind.cpp.

// Files `part` into fragmented_text_grid_ when AllowTextPartition accepts it.
// A rejected partition is deleted here, so the caller hands the pointer over
// either way. ASSERT_HOST requires `part` to be non-null.
403  {
404  ASSERT_HOST(part != nullptr);
405  if (AllowTextPartition(*part)) {
406  fragmented_text_grid_.InsertBBox(true, true, part);
407  } else {
408  delete part;
409  }
410 }
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition * part)
protected

Definition at line 422 of file tablefind.cpp.

// Inserts `part` into clean_part_grid_ unconditionally: no null check and no
// filtering is applied before the insert.
422  {
423  // NOTE: If images are placed into a different grid in the future,
424  // the function SetPartitionSpacings needs to be updated. It should
425  // be the only thing that cares about image partitions.
426  clean_part_grid_.InsertBBox(true, true, part);
427 }
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition * part)
protected

Definition at line 411 of file tablefind.cpp.

// Inserts `part` into leader_and_ruling_grid_ only when it is non-empty and
// its bounding box has positive area; otherwise the partition is deleted.
// ASSERT_HOST requires `part` to be non-null.
411  {
412  ASSERT_HOST(part != nullptr);
413  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414  leader_and_ruling_grid_.InsertBBox(true, true, part);
415  } else {
416  delete part;
417  }
418 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition * part)
protected

Definition at line 419 of file tablefind.cpp.

// Inserts `part` into leader_and_ruling_grid_ unconditionally (no null check
// and no emptiness/area test).
419  {
420  leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition * part)
protected

Definition at line 395 of file tablefind.cpp.

// Files `part` into clean_part_grid_ when AllowTextPartition accepts it.
// A rejected partition is deleted here, so the caller hands the pointer over
// either way. ASSERT_HOST requires `part` to be non-null.
395  {
396  ASSERT_HOST(part != nullptr);
397  if (AllowTextPartition(*part)) {
398  clean_part_grid_.InsertBBox(true, true, part);
399  } else {
400  delete part;
401  }
402 }
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid * grid,
ColPartitionSet **  columns,
WidthCallback * width_cb,
const FCOORD & reskew 
)

Definition at line 260 of file tablefind.cpp.

// Top-level table detection pass. As far as this listing shows, it
// initializes partitions, builds column blocks and moves them into
// col_seg_grid_, derives table columns and table regions, moves the regions
// into table_grid_, runs RecognizeTables, and finally calls MakeTableBlocks
// on `grid`.
// NOTE(review): the gutter numbers skip in several places (e.g. 269 -> 275,
// 281 -> 283, 295 -> 297, 319 -> 321, 323 -> 326, 341 -> 343), so some source
// lines are not shown in this rendering. In particular the `if` that pairs
// with the `} else {` at line 353 is not visible. Consult tablefind.cpp for
// the complete body.
// NOTE(review): the body uses `all_columns`, while the member signature shown
// for this function names the parameter `columns`; presumably the definition
// and declaration use different parameter names. Confirm in the source.
263  {
264  // initialize spacing, neighbors, and columns
265  InitializePartitions(all_columns);
266 
267 #ifndef GRAPHICS_DISABLED
268  if (textord_show_tables) {
269  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
275 
276  table_win = MakeWindow(100, 300, "Fragmented Text");
278  }
279 #endif // GRAPHICS_DISABLED
280 
281  // mark, filter, and smooth candidate table partitions
283 
284  // Make single-column blocks from good_columns_ partitions. col_segments are
285  // moved to a grid later which takes the ownership
286  ColSegment_LIST column_blocks;
287  GetColumnBlocks(all_columns, &column_blocks);
288  // Set the ratio of candidate table partitions in each column
289  SetColumnsType(&column_blocks);
290 
291  // Move column segments to col_seg_grid_
292  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293 
294  // Detect split in column layout that might have occurred due to the
295  // presence of a table. In such a case, merge the corresponding columns.
297 
298  // Group horizontally overlapping table partitions into table columns.
299  // table_columns created here get deleted at the end of this method.
300  ColSegment_LIST table_columns;
301  GetTableColumns(&table_columns);
302 
303  // Within each column, mark the range table regions occupy based on the
304  // table columns detected. table_regions are moved to a grid later which
305  // takes the ownership
306  ColSegment_LIST table_regions;
307  GetTableRegions(&table_columns, &table_regions);
308 
309 #ifndef GRAPHICS_DISABLED
311  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314  }
315 #endif // GRAPHICS_DISABLED
316 
317  // Merge table regions across columns for tables spanning multiple
318  // columns
319  MoveColSegmentsToGrid(&table_regions, &table_grid_);
321 
322  // Adjust table boundaries by including nearby horizontal lines and left
323  // out column headers
326 
328  // Remove false alarms consisting of a single column
330 
331 #ifndef GRAPHICS_DISABLED
332  if (textord_show_tables) {
333  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
335  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336  table_grid_.DisplayBoxes(table_win);
337  }
338 #endif // GRAPHICS_DISABLED
339 
340  // Find table grid structure and reject tables that are malformed.
341  RecognizeTables();
343  RecognizeTables();
344 
345 #ifndef GRAPHICS_DISABLED
346  if (textord_show_tables) {
347  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
350  table_grid_.DisplayBoxes(table_win);
351  }
352 #endif // GRAPHICS_DISABLED
353  } else {
354  // Remove false alarms consisting of a single column
355  // TODO(nbeato): verify this is a NOP after structured table rejection.
356  // Right now it isn't. If the recognize function is doing what it is
357  // supposed to do, this function is obsolete.
359 
360 #ifndef GRAPHICS_DISABLED
361  if (textord_show_tables) {
362  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
365  table_grid_.DisplayBoxes(table_win);
366  }
367 #endif // GRAPHICS_DISABLED
368  }
369 
370  // Merge all colpartitions in table regions to make them a single
371  // colpartition and revert types of isolated table cells not
372  // assigned to any table to their original types.
373  MakeTableBlocks(grid, all_columns, width_cb);
374 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1917
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1951
bool textord_show_tables
Definition: tablefind.cpp:143
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1144
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1872
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:524
bool textord_tablefind_show_mark
Definition: tablefind.cpp:145
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1177
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:1998
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:149
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1324
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:580
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1274
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:615

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid * grid,
ColPartitionSet **  columns,
WidthCallback * width_cb 
)
protected

Definition at line 1998 of file tablefind.cpp.

// Collapses each detected table into one ColPartition inside `grid`.
// Step 1: clears the table type on every PT_TABLE partition in `grid`.
// Step 2: for each segment in table_grid_, removes from `grid` every text
// partition whose overlap_fraction with the table box exceeds
// kMinOverlapWithTable, merges them with Absorb into a single partition,
// marks that partition as a table, and re-inserts it into `grid`.
// NOTE(review): the body indexes `all_columns`, while the member signature
// shown for this function names the parameter `columns`; presumably the
// definition uses a different parameter name. Confirm in tablefind.cpp.
2000  {
2001  // Since we have table blocks already, remove table tags from all
2002  // colpartitions
2003  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2004  gsearch(grid);
2005  gsearch.StartFullSearch();
2006  ColPartition* part = nullptr;
2007 
2008  while ((part = gsearch.NextFullSearch()) != nullptr) {
2009  if (part->type() == PT_TABLE) {
2010  part->clear_table_type();
2011  }
2012  }
2013  // Now make a single colpartition out of each table block and remove
2014  // all colpartitions contained within a table
2015  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2016  table_search(&table_grid_);
2017  table_search.StartFullSearch();
2018  ColSegment* table;
2019  while ((table = table_search.NextFullSearch()) != nullptr) {
2020  const TBOX& table_box = table->bounding_box();
2021  // Start a rect search on table_box
2022  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2023  rectsearch(grid);
2024  rectsearch.StartRectSearch(table_box);
// This `part` shadows the function-level `part` declared at line 2006.
2025  ColPartition* part;
2026  ColPartition* table_partition = nullptr;
2027  while ((part = rectsearch.NextRectSearch()) != nullptr) {
2028  // Do not consider image partitions
2029  if (!part->IsTextType())
2030  continue;
2031  TBOX part_box = part->bounding_box();
2032  // Include partition in the table if more than half of it
2033  // is covered by the table
2034  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2035  rectsearch.RemoveBBox();
// The first qualifying partition becomes the accumulator; later ones are
// absorbed into it.
2036  if (table_partition) {
2037  table_partition->Absorb(part, width_cb);
2038  } else {
2039  table_partition = part;
2040  }
2041  }
2042  }
2043  // Insert the merged table colpartition back into grid
2044  if (table_partition) {
2045  // To match the columns used when transforming to blocks, the new table
2046  // partition must have its first and last column set at the grid y that
2047  // corresponds to its bottom.
// This `table_box` shadows the segment's box from line 2020; it is the
// bounding box of the merged partition.
2048  const TBOX& table_box = table_partition->bounding_box();
2049  int grid_x, grid_y;
2050  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2051  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2052  table_partition->set_table_type();
2053  table_partition->set_blob_type(BRT_TEXT);
2054  table_partition->set_flow(BTFT_CHAIN);
2055  table_partition->SetBlobTypes();
2056  grid->InsertBBox(true, true, table_partition);
2057  }
2058  }
2059 }
Definition: rect.h:34
Definition: capi.h:100
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
int16_t left() const
Definition: rect.h:72
ColSegmentGrid table_grid_
Definition: tablefind.h:423
int16_t bottom() const
Definition: rect.h:65

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 519 of file tablefind.cpp.

// Forwards to clean_part_grid_.MakeWindow with the same position and title
// and returns whatever that call returns.
519  {
520  return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:591
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 828 of file tablefind.cpp.

// Walks every partition in clean_part_grid_ and sets the table type on text
// partitions whose median height does not exceed
// kMaxTableCellXheight * global_median_xheight_ and for which either
// HasWideOrNoInterWordGap or HasLeaderAdjacent returns true.
828  {
829  // Iterate the ColPartitions in the grid.
830  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
831  gsearch(&clean_part_grid_);
832  gsearch.StartFullSearch();
833  ColPartition* part = nullptr;
834  while ((part = gsearch.NextFullSearch()) != nullptr) {
835  if (!part->IsTextType()) // Only consider text partitions
836  continue;
837  // Only consider partitions in dominant font size or smaller
838  if (part->median_height() > kMaxTableCellXheight * global_median_xheight_)
839  continue;
840  // Mark partitions with a large gap, or no significant gap as
841  // table partitions.
842  // Comments: It produces several false alarms at:
843  // - last line of a paragraph (fixed)
844  // - single word section headings
845  // - page headers and footers
846  // - numbered equations
847  // - line drawing regions
848  // TODO(faisal): detect and fix above-mentioned cases
849  if (HasWideOrNoInterWordGap(part) ||
850  HasLeaderAdjacent(*part)) {
851  part->set_table_type();
852  }
853  }
854 }
const double kMaxTableCellXheight
Definition: tablefind.cpp:81
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:947
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:858
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 790 of file tablefind.cpp.

// NOTE(review): most of this listing is missing from the rendering. Only the
// four debug-window creations and their closing braces are visible (gutter
// numbers 791-792, 794-796, 798-799, ... are skipped), so the marking and
// filtering calls made between them cannot be read from here. Consult
// tablefind.cpp for the complete body.
790  {
793  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
797  }
800  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
804  }
807  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
811  }
814  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
818  }
819 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1917
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
bool textord_show_tables
Definition: tablefind.cpp:143
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:828
bool textord_tablefind_show_mark
Definition: tablefind.cpp:145
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid * col_seg_grid 
)
protected

Definition at line 1177 of file tablefind.cpp.

// Transfers every segment out of `segments` into `col_seg_grid`: each element
// is extracted from the list while iterating and inserted into the grid.
1178  {
1179  ColSegment_IT it(segments);
1180  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1181  ColSegment* seg = it.extract();
1182  col_seg_grid->InsertBBox(true, true, seg);
1183  }
1184 }

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1818 of file tablefind.cpp.

// Runs a TableRecognizer over every candidate in the grid searched by
// `gsearch`. Each candidate is removed from the grid; if RecognizeTable
// returns a structure, the candidate takes that structure's bounding box and
// is kept in `good_tables`, otherwise it is deleted. The kept candidates are
// inserted into table_grid_ at the end.
// NOTE(review): gutter numbers 1822-1823 and 1839 are skipped, so the
// declaration of `gsearch` is not visible in this rendering. Presumably it is
// a ColSegmentGridSearch over table_grid_; confirm in tablefind.cpp.
1818  {
1819  ScrollView* table_win = nullptr;
1820  if (textord_show_tables) {
1821  table_win = MakeWindow(0, 0, "Table Structure");
1824  // table_grid_.DisplayBoxes(table_win);
1825  }
1826 
1827 
1828  TableRecognizer recognizer;
1829  recognizer.Init();
1830  recognizer.set_line_grid(&leader_and_ruling_grid_);
1831  recognizer.set_text_grid(&fragmented_text_grid_);
1832  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1833  recognizer.set_min_height(1.5 * gridheight());
1834  // Loop over all of the tables and try to fit them.
1835  // Store the good tables here.
1836  ColSegment_CLIST good_tables;
1837  ColSegment_C_IT good_it(&good_tables);
1838 
1840  gsearch.StartFullSearch();
1841  ColSegment* found_table = nullptr;
1842  while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1843  gsearch.RemoveBBox();
1844 
1845  // The goal is to make the tables persistent in a list.
1846  // When that happens, this will move into the search loop.
1847  const TBOX& found_box = found_table->bounding_box();
1848  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1849 
1850  // Process a table. Good tables are inserted into the grid again later on
1851  // We can't change boxes in the grid while it is running a search.
1852  if (table_structure != nullptr) {
1853  if (textord_show_tables) {
1854  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1855  }
1856  found_table->set_bounding_box(table_structure->bounding_box());
// The structure is only needed for its bounding box; it is freed here.
1857  delete table_structure;
1858  good_it.add_after_then_move(found_table);
1859  } else {
1860  delete found_table;
1861  }
1862  }
1863  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1864 
1865  // At this point, the grid is empty. We can safely insert the good tables
1866  // back into grid.
1867  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1868  table_grid_.InsertBBox(true, true, good_it.extract());
1869 }
int gridheight() const
Definition: tablefind.cpp:385
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1917
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
Definition: rect.h:34
bool textord_show_tables
Definition: tablefind.cpp:143
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:488
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 760 of file tablefind.cpp.

// NOTE(review): source line 761 is missing from this rendering. Presumably it
// assigns `width` to global_median_blob_width_, mirroring the sibling setters
// for xheight and ledding; confirm against tablefind.cpp.
760  {
762 }

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 763 of file tablefind.cpp.

// Stores `ledding` in global_median_ledding_.
763  {
764  global_median_ledding_ = ledding;
765 }

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 757 of file tablefind.cpp.

// Stores `xheight` in global_median_xheight_.
757  {
758  global_median_xheight_ = xheight;
759 }

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 178 of file tablefind.cpp.

// Stores `order` in left_to_right_language_.
178  {
179  left_to_right_language_ = order;
180 }

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

// Stores `resolution` in resolution_.
138  {
139  resolution_ = resolution;
140  }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1144 of file tablefind.cpp.

// For each column segment, counts the PT_TABLE and PT_FLOWING_TEXT partitions
// of clean_part_grid_ found by a rect search over the segment's bounding box.
// Segments with neither kind are removed from the list and deleted; the rest
// get both counts stored and set_type() called.
// NOTE(review): the body iterates `column_blocks`, while the member signature
// shown for this function names the parameter `col_segments`; presumably the
// definition uses a different parameter name. Confirm in tablefind.cpp.
1144  {
1145  ColSegment_IT it(column_blocks);
1146  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1147  ColSegment* seg = it.data();
1148  TBOX box = seg->bounding_box();
1149  int num_table_cells = 0;
1150  int num_text_cells = 0;
1151  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1152  rsearch(&clean_part_grid_);
1153  rsearch.SetUniqueMode(true);
1154  rsearch.StartRectSearch(box);
1155  ColPartition* part = nullptr;
1156  while ((part = rsearch.NextRectSearch()) != nullptr) {
1157  if (part->type() == PT_TABLE) {
1158  num_table_cells++;
1159  } else if (part->type() == PT_FLOWING_TEXT) {
1160  num_text_cells++;
1161  }
1162  }
1163  // If a column block has no text or table partition in it, it is not needed
1164  // for table detection.
1165  if (!num_table_cells && !num_text_cells) {
1166  delete it.extract();
1167  } else {
1168  seg->set_num_table_cells(num_table_cells);
1169  seg->set_num_text_cells(num_text_cells);
1170  // set column type based on the ratio of table to text cells
1171  seg->set_type();
1172  }
1173  }
1174 }
Definition: rect.h:34
Definition: capi.h:100
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid * grid)
protected

Definition at line 710 of file tablefind.cpp.

// Computes three page-wide medians from the text partitions in `grid` and
// stores them through the set_global_median_* setters: blob height (used as
// x-height), blob width, and the space above/below each partition
// ("ledding"; presumably a spelling of "leading", i.e. line spacing).
// Heights and widths are accumulated per blob, spacings per partition.
// NOTE(review): gutter number 747 is skipped, so the condition opening the
// block closed at line 753 is not shown. Presumably it tests
// textord_tablefind_show_stats; confirm in tablefind.cpp.
710  {
711  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
712  STATS width_stats(0, kMaxBlobWidth + 1);
713  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
714  // Iterate the ColPartitions in the grid.
715  ColPartitionGridSearch gsearch(grid);
716  gsearch.SetUniqueMode(true);
717  gsearch.StartFullSearch();
718  ColPartition* part = nullptr;
719  while ((part = gsearch.NextFullSearch()) != nullptr) {
720  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
721  // ComputeLimits needs to get called somewhere outside of TableFinder
722  // to make sure the partitions are properly initialized.
723  // When this is called, SmoothPartitionPartners dies in an assert after
724  // table find runs. Alternative solution.
725  // part->ComputeLimits();
726  if (part->IsTextType()) {
727  // xheight_stats.add(part->median_height(), part->boxes_count());
728  // width_stats.add(part->median_width(), part->boxes_count());
729 
730  // This loop can be removed when above issues are fixed.
731  // Replace it with the 2 lines commented out above.
732  BLOBNBOX_C_IT it(part->boxes());
733  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
734  xheight_stats.add(it.data()->bounding_box().height(), 1);
735  width_stats.add(it.data()->bounding_box().width(), 1);
736  }
737 
738  ledding_stats.add(part->space_above(), 1);
739  ledding_stats.add(part->space_below(), 1);
740  }
741  }
742  // Set estimates based on median of statistics obtained
// Adding 0.5 before the int cast rounds each median to the nearest integer.
743  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
744  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
745  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
746  #ifndef GRAPHICS_DISABLED
748  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
749  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
750  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
751  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
752  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
753  }
754  #endif // GRAPHICS_DISABLED
755 }
const int kMaxBlobWidth
Definition: tablefind.cpp:40
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:757
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
bool textord_tablefind_show_stats
Definition: tablefind.cpp:147
Definition: statistc.h:33
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:763
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:760

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid * grid,
ColPartitionSet **  all_columns 
)
static protected

Definition at line 587 of file tablefind.cpp.

// For every partition in `grid`, records four spacing values on it:
//  - space to left/right: distance from the partition's box to the edge of
//    the column containing that side (looked up in
//    all_columns[gsearch.GridY()]), clamped at 0, then reduced if an image
//    partition found by the side searches lies closer;
//  - space above/below: difference of box bottoms with the singleton partner
//    in that direction, clamped at 0, or INT32_MAX when there is no partner.
588  {
589  // Iterate the ColPartitions in the grid.
590  ColPartitionGridSearch gsearch(grid);
591  gsearch.StartFullSearch();
592  ColPartition* part = nullptr;
593  while ((part = gsearch.NextFullSearch()) != nullptr) {
594  ColPartitionSet* columns = all_columns[gsearch.GridY()];
595  TBOX box = part->bounding_box();
596  int y = part->MidY();
597  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
598  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
599  // set distance from left column as space to the left
600  if (left_column) {
601  int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
602  part->set_space_to_left(left_space);
603  }
604  // set distance from right column as space to the right
605  if (right_column) {
606  int right_space = std::max(0, right_column->RightAtY(y) - box.right());
607  part->set_space_to_right(right_space);
608  }
609 
610  // Look for images that may be closer.
611  // NOTE: used to be part_grid_, might cause issues now
612  ColPartitionGridSearch hsearch(grid);
613  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
614  ColPartition* neighbor = nullptr;
615  while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
616  if (neighbor->type() == PT_PULLOUT_IMAGE ||
617  neighbor->type() == PT_FLOWING_IMAGE ||
618  neighbor->type() == PT_HEADING_IMAGE) {
619  int right = neighbor->bounding_box().right();
620  if (right < box.left()) {
621  int space = std::min(box.left() - right, part->space_to_left());
622  part->set_space_to_left(space);
623  }
624  }
625  }
// NOTE(review): the second pass also starts at box.left() although it only
// accepts images whose left edge is beyond box.right(). Presumably that is
// intentional; confirm.
626  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
627  neighbor = nullptr;
628  while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
629  if (neighbor->type() == PT_PULLOUT_IMAGE ||
630  neighbor->type() == PT_FLOWING_IMAGE ||
631  neighbor->type() == PT_HEADING_IMAGE) {
632  int left = neighbor->bounding_box().left();
633  if (left > box.right()) {
634  int space = std::min(left - box.right(), part->space_to_right());
635  part->set_space_to_right(space);
636  }
637  }
638  }
639 
640  ColPartition* upper_part = part->SingletonPartner(true);
641  if (upper_part) {
642  int space = std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
643  part->bounding_box().bottom()));
644  part->set_space_above(space);
645  } else {
646  // TODO(nbeato): What constitutes a good value?
647  // 0 is the default value when not set, explicitly noting it needs to
648  // be something else.
649  part->set_space_above(INT32_MAX);
650  }
651 
652  ColPartition* lower_part = part->SingletonPartner(false);
653  if (lower_part) {
654  int space = std::max(0, static_cast<int>(part->bounding_box().bottom() -
655  lower_part->bounding_box().bottom()));
656  part->set_space_below(space);
657  } else {
658  // TODO(nbeato): What constitutes a good value?
659  // 0 is the default value when not set, explicitly noting it needs to
660  // be something else.
661  part->set_space_below(INT32_MAX);
662  }
663  }
664 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
int16_t top() const
Definition: rect.h:58
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition * part)
protected

Definition at line 667 of file tablefind.cpp.

// Finds the nearest partitions above and below `part` in clean_part_grid_ and
// stores them, with their gaps, on `part`.
// The search window is part's box stretched vertically by
// kMaxVerticalSpacing, clamped to the grid's tright()/bleft() y range. Only
// neighbors with major_x_overlap against part's box and lying strictly
// above or below it are considered. The gap is the absolute difference of
// the two median_bottom values.
// When no neighbor qualifies on a side, the stored space stays at
// kMaxVerticalSpacing and the stored neighbor is nullptr.
667  {
668  TBOX box = part->bounding_box();
669  int top_range = std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
670  int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, static_cast<int>(bleft().y()));
671  box.set_top(top_range);
672  box.set_bottom(bottom_range);
673 
674  TBOX part_box = part->bounding_box();
675  // Start a rect search
676  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
677  rectsearch(&clean_part_grid_);
678  rectsearch.StartRectSearch(box);
679  ColPartition* neighbor;
680  int min_space_above = kMaxVerticalSpacing;
681  int min_space_below = kMaxVerticalSpacing;
682  ColPartition* above_neighbor = nullptr;
683  ColPartition* below_neighbor = nullptr;
684  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
685  if (neighbor == part)
686  continue;
687  TBOX neighbor_box = neighbor->bounding_box();
688  if (neighbor_box.major_x_overlap(part_box)) {
689  int gap = abs(part->median_bottom() - neighbor->median_bottom());
690  // If neighbor is below current partition
691  if (neighbor_box.top() < part_box.bottom() &&
692  gap < min_space_below) {
693  min_space_below = gap;
694  below_neighbor = neighbor;
695  } // If neighbor is above current partition
696  else if (part_box.top() < neighbor_box.bottom() &&
697  gap < min_space_above) {
698  min_space_above = gap;
699  above_neighbor = neighbor;
700  }
701  }
702  }
703  part->set_space_above(min_space_above);
704  part->set_space_below(min_space_below);
705  part->set_nearest_neighbor_above(above_neighbor);
706  part->set_nearest_neighbor_below(below_neighbor);
707 }
const ICOORD & bleft() const
Definition: tablefind.cpp:388
void set_top(int y)
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:68
int16_t y() const
access_function
Definition: points.h:57
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
Definition: rect.h:34
const ICOORD & tright() const
Definition: tablefind.cpp:391
int16_t top() const
Definition: rect.h:58
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
int16_t bottom() const
Definition: rect.h:65

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1109 of file tablefind.cpp.

// Two smoothing passes over the partitions returned by `gsearch`.
// Pass 1: a partition whose type is below PT_TABLE and not PT_UNKNOWN becomes
// a table partition when both its nearest neighbor above and its nearest
// neighbor below are PT_TABLE.
// Pass 2: a PT_TABLE partition loses its table type when both neighbors exist
// and neither is PT_TABLE.
// NOTE(review): gutter number 1111 is skipped, so the declaration of
// `gsearch` is not shown in this rendering. Presumably it is a
// ColPartitionGridSearch over clean_part_grid_; confirm in tablefind.cpp.
1109  {
1110  // Iterate the ColPartitions in the grid.
1112  gsearch.StartFullSearch();
1113  ColPartition* part = nullptr;
1114  while ((part = gsearch.NextFullSearch()) != nullptr) {
1115  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1116  continue; // Consider only text partitions
1117  ColPartition* upper_part = part->nearest_neighbor_above();
1118  ColPartition* lower_part = part->nearest_neighbor_below();
1119  if (!upper_part || !lower_part)
1120  continue;
1121  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1122  part->set_table_type();
1123  }
1124 
1125  // Pass 2, do the opposite. If both the upper and lower neighbors
1126  // exist and are not tables, this probably shouldn't be a table.
1127  gsearch.StartFullSearch();
1128  part = nullptr;
1129  while ((part = gsearch.NextFullSearch()) != nullptr) {
1130  if (part->type() != PT_TABLE)
1131  continue; // Consider only table partitions
1132  ColPartition* upper_part = part->nearest_neighbor_above();
1133  ColPartition* lower_part = part->nearest_neighbor_below();
1134 
1135  // table can't be by itself
1136  if ((upper_part && upper_part->type() != PT_TABLE) &&
1137  (lower_part && lower_part->type() != PT_TABLE)) {
1138  part->clear_table_type();
1139  }
1140  }
1141 }
Definition: capi.h:100
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition * part)
protected

Definition at line 437 of file tablefind.cpp.

// Splits `part` at every horizontal gap between blobs that is wider than
// median_width() * kSplitPartitionSize, and passes the final remaining
// (right-most) piece to InsertFragmentedTextPartition.
// An empty partition is deleted and nothing is inserted. ASSERT_HOST requires
// `part` to be non-null and, once it is known to be non-empty, its
// median_width() to be positive.
// NOTE(review): gutter number 471 is skipped, so what happens to `left_part`
// after SplitAt is not shown. Presumably it is passed to
// InsertFragmentedTextPartition, as the comment at 466 describes; confirm in
// tablefind.cpp.
437  {
438  ASSERT_HOST(part != nullptr);
439  // Bye bye empty partitions!
440  if (part->boxes()->empty()) {
441  delete part;
442  return;
443  }
444 
445  // The AllowBlob function prevents this.
446  ASSERT_HOST(part->median_width() > 0);
// The threshold is computed once from the unsplit partition and reused for
// every piece.
447  const double kThreshold = part->median_width() * kSplitPartitionSize;
448 
449  ColPartition* right_part = part;
450  bool found_split = true;
451  while (found_split) {
452  found_split = false;
453  BLOBNBOX_C_IT box_it(right_part->boxes());
454  // Blobs are sorted left side first. If blobs overlap,
455  // the previous blob may have a "more right" right side.
456  // Account for this by always keeping the largest "right"
457  // so far.
// INT32_MIN marks "no blob seen yet" so the first blob never triggers a split.
458  int previous_right = INT32_MIN;
459 
460  // Look for the next split in the partition.
461  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462  const TBOX& box = box_it.data()->bounding_box();
463  if (previous_right != INT32_MIN &&
464  box.left() - previous_right > kThreshold) {
465  // We have a split position. Split the partition in two pieces.
466  // Insert the left piece in the grid and keep processing the right.
467  int mid_x = (box.left() + previous_right) / 2;
468  ColPartition* left_part = right_part;
469  right_part = left_part->SplitAt(mid_x);
470 
472  found_split = true;
473  break;
474  }
475 
476  // The right side of the previous blobs.
477  previous_right = std::max(previous_right, static_cast<int>(box.right()));
478  }
479  }
480  // When a split is not found, the right part is minimized
481  // as much as possible, so process it.
482  InsertFragmentedTextPartition(right_part);
483 }
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t right() const
Definition: rect.h:79
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:403
const double kSplitPartitionSize
Definition: tablefind.cpp:44
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 391 of file tablefind.cpp.

// Returns clean_part_grid_.tright(), i.e. the corner reported by the clean
// partition grid.
391  {
392  return clean_part_grid_.tright();
393 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
const ICOORD & tright() const
Definition: bbgrid.h:76

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 413 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 421 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 419 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 407 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 409 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 405 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 415 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 425 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 403 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 423 of file tablefind.h.


The documentation for this class was generated from the following files: