tesseract  5.0.0-alpha-619-ge9db
tablefind.cpp
Go to the documentation of this file.
1 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config_auto.h"
21 #endif
22 
23 #include "tablefind.h"
24 #include <algorithm>
25 #include <cmath>
26 
27 #include "allheaders.h"
28 
29 #include "colpartitionset.h"
30 #include "tablerecog.h"
31 
32 namespace tesseract {
33 
34 // These numbers are used to calculate the global median stats.
35 // They just set an upper bound on the stats objects.
36 // Maximum vertical spacing between neighbor partitions.
37 const int kMaxVerticalSpacing = 500;
38 // Maximum width of a blob in a partition.
39 const int kMaxBlobWidth = 500;
40 
41 // Minimum whitespace size to split a partition (measured as a multiple
42 // of a partition's median width).
43 const double kSplitPartitionSize = 2.0;
44 // To insert text, the partition must satisfy these size constraints
45 // in AllowTextPartition(). The idea is to filter noise partitions
46 // determined by the size compared to the global medians.
47 // TODO(nbeato): Need to find good numbers again.
48 const double kAllowTextHeight = 0.5;
49 const double kAllowTextWidth = 0.6;
50 const double kAllowTextArea = 0.8;
51 // The same thing applies to blobs (to filter noise).
52 // TODO(nbeato): These numbers are a shot in the dark...
53 // height and width are 0.5 * gridsize() in colfind.cpp
54 // area is a rough guess for the size of a period.
55 const double kAllowBlobHeight = 0.3;
56 const double kAllowBlobWidth = 0.4;
57 const double kAllowBlobArea = 0.05;
58 
59 // Minimum number of components in a text partition. A partition having fewer
60 // components than that is more likely a data partition and is a candidate
61 // table cell.
62 const int kMinBoxesInTextPartition = 10;
63 
64 // Maximum number of components that a data partition can have
65 const int kMaxBoxesInDataPartition = 20;
66 
67 // Maximum allowed gap in a text partitions as a multiple of its median size.
68 const double kMaxGapInTextPartition = 4.0;
69 
70 // Minimum value that the maximum gap in a text partition should have as a
71 // factor of its median size.
72 const double kMinMaxGapInTextPartition = 0.5;
73 
74 // The amount of overlap that is "normal" for adjacent blobs in a text
75 // partition. This is used to calculate gap between overlapping blobs.
76 const double kMaxBlobOverlapFactor = 4.0;
77 
78 // Maximum x-height a table partition can have as a multiple of global
79 // median x-height
80 const double kMaxTableCellXheight = 2.0;
81 
82 // Maximum line spacing between a table column header and column contents
83 // for merging the two (as a multiple of the partition's median_height).
85 
86 // Minimum ratio of num_table_partitions to num_text_partitions in a column
87 // block to be called it a table column
88 const double kTableColumnThreshold = 3.0;
89 
90 // Search for horizontal ruling lines within the vertical margin as a
91 // multiple of grid size
92 // const int kRulingVerticalMargin = 3;
93 
94 // Minimum overlap that a colpartition must have with a table region
95 // to become part of that table
96 const double kMinOverlapWithTable = 0.6;
97 
98 // Maximum side space (distance from column boundary) that a typical
99 // text-line in flowing text should have as a multiple of its x-height
100 // (Median size).
101 const int kSideSpaceMargin = 10;
102 
103 // Fraction of the peak of x-projection of a table region to set the
104 // threshold for the x-projection histogram
105 const double kSmallTableProjectionThreshold = 0.35;
106 const double kLargeTableProjectionThreshold = 0.45;
107 // Minimum number of rows required to look for more rows in the projection.
108 const int kLargeTableRowCount = 6;
109 
110 // Minimum number of rows in a table
111 const int kMinRowsInTable = 3;
112 
113 // The amount of padding (multiplied by global_median_xheight_ during use)
114 // that is vertically added to the search adjacent leader search during
115 // ColPartition marking.
117 
118 // Used when filtering false positives. When finding the last line
119 // of a paragraph (typically left-aligned), the previous line should have
120 // its center to the right of the last line by this scaled amount.
122 
123 // The maximum amount of whitespace allowed left of a paragraph ending.
124 // Do not filter a ColPartition with more than this space left of it.
126 
127 // Used when filtering false positives. The last line of a paragraph
128 // should be preceded by a line that is predominantly text. This is the
129 // ratio of text to whitespace (to the right of the text) that is required
130 // for the previous line to be a text.
132 
133 // When counting table columns, this is the required gap between two columns
134 // (it is multiplied by global_median_xheight_).
135 const double kMaxXProjectionGapFactor = 2.0;
136 
137 // Used for similarity in partitions using stroke width. Values copied
138 // from ColFind.cpp in Ray's CL.
140 const double kStrokeWidthConstantTolerance = 2.0;
141 
142 static BOOL_VAR(textord_show_tables, false, "Show table regions");
143 static BOOL_VAR(textord_tablefind_show_mark, false,
144  "Debug table marking steps in detail");
145 static BOOL_VAR(textord_tablefind_show_stats, false,
146  "Show page stats used in table finding");
147 static BOOL_VAR(textord_tablefind_recognize_tables, false,
148  "Enables the table recognizer for table layout and filtering.");
149 
152 
153 // Templated helper function used to create destructor callbacks for the
154 // BBGrid::ClearGridData() method.
155 template <typename T> void DeleteObject(T *object) {
156  delete object;
157 }
158 
160  : resolution_(0),
161  global_median_xheight_(0),
162  global_median_blob_width_(0),
163  global_median_ledding_(0),
164  left_to_right_language_(true) {
165 }
166 
168  // ColPartitions and ColSegments created by this class for storage in grids
169  // need to be deleted explicitly.
170  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
176 
178  left_to_right_language_ = order;
179 }
180 
181 void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
182  const ICOORD& top_right) {
183  // Initialize clean partitions list and grid
184  clean_part_grid_.Init(grid_size, bottom_left, top_right);
185  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187  col_seg_grid_.Init(grid_size, bottom_left, top_right);
188  table_grid_.Init(grid_size, bottom_left, top_right);
189 }
190 
191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192 // insert leaders and rulers into the leader_and_ruling_grid_
194  TO_BLOCK* block) {
195  // Calculate stats. This lets us filter partitions in AllowTextPartition()
196  // and filter blobs in AllowBlob().
197  SetGlobalSpacings(grid);
198 
199  // Iterate the ColPartitions in the grid.
200  ColPartitionGridSearch gsearch(grid);
201  gsearch.SetUniqueMode(true);
202  gsearch.StartFullSearch();
203  ColPartition* part = nullptr;
204  while ((part = gsearch.NextFullSearch()) != nullptr) {
205  // Reject partitions with nothing useful inside of them.
206  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
207  continue;
208  ColPartition* clean_part = part->ShallowCopy();
209  ColPartition* leader_part = nullptr;
210  if (part->IsLineType()) {
211  InsertRulingPartition(clean_part);
212  continue;
213  }
214  // Insert all non-text partitions to clean_parts
215  if (!part->IsTextType()) {
216  InsertImagePartition(clean_part);
217  continue;
218  }
219  // Insert text colpartitions after removing noisy components from them
220  // The leaders are split into a separate grid.
221  BLOBNBOX_CLIST* part_boxes = part->boxes();
222  BLOBNBOX_C_IT pit(part_boxes);
223  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
224  BLOBNBOX *pblob = pit.data();
225  // Bad blobs... happens in UNLV set.
226  // news.3G1, page 17 (around x=6)
227  if (!AllowBlob(*pblob))
228  continue;
229  if (pblob->flow() == BTFT_LEADER) {
230  if (leader_part == nullptr) {
231  leader_part = part->ShallowCopy();
232  leader_part->set_flow(BTFT_LEADER);
233  }
234  leader_part->AddBox(pblob);
235  } else if (pblob->region_type() != BRT_NOISE) {
236  clean_part->AddBox(pblob);
237  }
238  }
239  clean_part->ComputeLimits();
240  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
241  InsertTextPartition(clean_part);
243  if (leader_part != nullptr) {
244  // TODO(nbeato): Note that ComputeLimits does not update the column
245  // information. So the leader may appear to span more columns than it
246  // really does later on when IsInSameColumnAs gets called to test
247  // for adjacent leaders.
248  leader_part->ComputeLimits();
249  InsertLeaderPartition(leader_part);
250  }
251  }
252 
253  // Make the partition partners better for upper and lower neighbors.
256 }
257 
258 // High level function to perform table detection
260  ColPartitionSet** all_columns,
261  WidthCallback width_cb,
262  const FCOORD& reskew) {
263  // initialize spacing, neighbors, and columns
264  InitializePartitions(all_columns);
265 
266 #ifndef GRAPHICS_DISABLED
267  if (textord_show_tables) {
268  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
274 
275  table_win = MakeWindow(100, 300, "Fragmented Text");
277  }
278 #endif // GRAPHICS_DISABLED
279 
280  // mark, filter, and smooth candidate table partitions
282 
283  // Make single-column blocks from good_columns_ partitions. col_segments are
284  // moved to a grid later which takes the ownership
285  ColSegment_LIST column_blocks;
286  GetColumnBlocks(all_columns, &column_blocks);
287  // Set the ratio of candidate table partitions in each column
288  SetColumnsType(&column_blocks);
289 
290  // Move column segments to col_seg_grid_
291  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
292 
293  // Detect split in column layout that might have occurred due to the
294  // presence of a table. In such a case, merge the corresponding columns.
296 
297  // Group horizontally overlapping table partitions into table columns.
298  // table_columns created here get deleted at the end of this method.
299  ColSegment_LIST table_columns;
300  GetTableColumns(&table_columns);
301 
302  // Within each column, mark the range table regions occupy based on the
303  // table columns detected. table_regions are moved to a grid later which
304  // takes the ownership
305  ColSegment_LIST table_regions;
306  GetTableRegions(&table_columns, &table_regions);
307 
308 #ifndef GRAPHICS_DISABLED
309  if (textord_tablefind_show_mark) {
310  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
311  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
312  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
313  }
314 #endif // GRAPHICS_DISABLED
315 
316  // Merge table regions across columns for tables spanning multiple
317  // columns
318  MoveColSegmentsToGrid(&table_regions, &table_grid_);
320 
321  // Adjust table boundaries by including nearby horizontal lines and left
322  // out column headers
325 
326  if (textord_tablefind_recognize_tables) {
327  // Remove false alarms consisting of a single column
329 
330 #ifndef GRAPHICS_DISABLED
331  if (textord_show_tables) {
332  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
334  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
335  table_grid_.DisplayBoxes(table_win);
336  }
337 #endif // GRAPHICS_DISABLED
338 
339  // Find table grid structure and reject tables that are malformed.
340  RecognizeTables();
342  RecognizeTables();
343 
344 #ifndef GRAPHICS_DISABLED
345  if (textord_show_tables) {
346  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
349  table_grid_.DisplayBoxes(table_win);
350  }
351 #endif // GRAPHICS_DISABLED
352  } else {
353  // Remove false alarms consisting of a single column
354  // TODO(nbeato): verify this is a NOP after structured table rejection.
355  // Right now it isn't. If the recognize function is doing what it is
356  // supposed to do, this function is obsolete.
358 
359 #ifndef GRAPHICS_DISABLED
360  if (textord_show_tables) {
361  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
364  table_grid_.DisplayBoxes(table_win);
365  }
366 #endif // GRAPHICS_DISABLED
367  }
368 
369  // Merge all colpartitions in table regions to make them a single
370  // colpartition and revert types of isolated table cells not
371  // assigned to any table to their original types.
372  MakeTableBlocks(grid, all_columns, width_cb);
373 }
374 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
375 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
376 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
377 // dimensions instead of duplicated memory.
379  return clean_part_grid_.gridsize();
380 }
382  return clean_part_grid_.gridwidth();
383 }
385  return clean_part_grid_.gridheight();
386 }
387 const ICOORD& TableFinder::bleft() const {
388  return clean_part_grid_.bleft();
389 }
390 const ICOORD& TableFinder::tright() const {
391  return clean_part_grid_.tright();
392 }
393 
395  ASSERT_HOST(part != nullptr);
396  if (AllowTextPartition(*part)) {
397  clean_part_grid_.InsertBBox(true, true, part);
398  } else {
399  delete part;
400  }
401 }
403  ASSERT_HOST(part != nullptr);
404  if (AllowTextPartition(*part)) {
405  fragmented_text_grid_.InsertBBox(true, true, part);
406  } else {
407  delete part;
408  }
409 }
411  ASSERT_HOST(part != nullptr);
412  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
413  leader_and_ruling_grid_.InsertBBox(true, true, part);
414  } else {
415  delete part;
416  }
417 }
419  leader_and_ruling_grid_.InsertBBox(true, true, part);
420 }
422  // NOTE: If images are placed into a different grid in the future,
423  // the function SetPartitionSpacings needs to be updated. It should
424  // be the only thing that cares about image partitions.
425  clean_part_grid_.InsertBBox(true, true, part);
426 }
427 
428 // Splits a partition into its "words". The splits happen
429 // at locations with wide inter-blob spacing. This is useful
430 // because it allows the table recognize to "cut through" the
431 // text lines on the page. The assumption is that a table
432 // will have several lines with similar overlapping whitespace
433 // whereas text will not have this type of property.
434 // Note: The code Assumes that blobs are sorted by the left side x!
435 // This will not work (as well) if the blobs are sorted by center/right.
437  ASSERT_HOST(part != nullptr);
438  // Bye bye empty partitions!
439  if (part->boxes()->empty()) {
440  delete part;
441  return;
442  }
443 
444  // The AllowBlob function prevents this.
445  ASSERT_HOST(part->median_width() > 0);
446  const double kThreshold = part->median_width() * kSplitPartitionSize;
447 
448  ColPartition* right_part = part;
449  bool found_split = true;
450  while (found_split) {
451  found_split = false;
452  BLOBNBOX_C_IT box_it(right_part->boxes());
453  // Blobs are sorted left side first. If blobs overlap,
454  // the previous blob may have a "more right" right side.
455  // Account for this by always keeping the largest "right"
456  // so far.
457  int previous_right = INT32_MIN;
458 
459  // Look for the next split in the partition.
460  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461  const TBOX& box = box_it.data()->bounding_box();
462  if (previous_right != INT32_MIN &&
463  box.left() - previous_right > kThreshold) {
464  // We have a split position. Split the partition in two pieces.
465  // Insert the left piece in the grid and keep processing the right.
466  int mid_x = (box.left() + previous_right) / 2;
467  ColPartition* left_part = right_part;
468  right_part = left_part->SplitAt(mid_x);
469 
471  found_split = true;
472  break;
473  }
474 
475  // The right side of the previous blobs.
476  previous_right = std::max(previous_right, static_cast<int>(box.right()));
477  }
478  }
479  // When a split is not found, the right part is minimized
480  // as much as possible, so process it.
481  InsertFragmentedTextPartition(right_part);
482 }
483 
484 // Some simple criteria to filter out now. We want to make sure the
485 // average blob size in the partition is consistent with the
486 // global page stats.
487 // The area metric will almost always pass for multi-blob partitions.
488 // It is useful when filtering out noise caused by an isolated blob.
490  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
491  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
492  const int median_area = global_median_xheight_ * global_median_blob_width_;
493  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
494  // Keep comparisons strictly greater to disallow 0!
495  return part.median_height() > kHeightRequired &&
496  part.median_width() > kWidthRequired &&
497  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
498 }
499 
500 // Same as above, applied to blobs. Keep in mind that
501 // leaders, commas, and periods are important in tables.
502 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
503  const TBOX& box = blob.bounding_box();
504  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
505  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
506  const int median_area = global_median_xheight_ * global_median_blob_width_;
507  const double kAreaRequired = median_area * kAllowBlobArea;
508  // Keep comparisons strictly greater to disallow 0!
509  return box.height() > kHeightRequired &&
510  box.width() > kWidthRequired &&
511  box.area() > kAreaRequired;
512 }
513 
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
519  return clean_part_grid_.MakeWindow(x, y, window_name);
520 }
521 
522 // Make single-column blocks from good_columns_ partitions.
524  ColSegment_LIST* column_blocks) {
525  for (int i = 0; i < gridheight(); ++i) {
526  ColPartitionSet* columns = all_columns[i];
527  if (columns != nullptr) {
528  ColSegment_LIST new_blocks;
529  // Get boxes from the current vertical position on the grid
530  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
531  // Merge the new_blocks boxes into column_blocks if they are well-aligned
532  GroupColumnBlocks(&new_blocks, column_blocks);
533  }
534  }
535 }
536 
537 // Merge column segments into the current list if they are well aligned.
538 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
539  ColSegment_LIST* column_blocks) {
540  ColSegment_IT src_it(new_blocks);
541  ColSegment_IT dest_it(column_blocks);
542  // iterate through the source list
543  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
544  ColSegment* src_seg = src_it.data();
545  const TBOX& src_box = src_seg->bounding_box();
546  bool match_found = false;
547  // iterate through the destination list to find a matching column block
548  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
549  ColSegment* dest_seg = dest_it.data();
550  TBOX dest_box = dest_seg->bounding_box();
551  if (ConsecutiveBoxes(src_box, dest_box)) {
552  // If matching block is found, insert the current block into it
553  // and delete the source block.
554  dest_seg->InsertBox(src_box);
555  match_found = true;
556  delete src_it.extract();
557  break;
558  }
559  }
560  // If no match is found, just append the source block to column_blocks
561  if (!match_found) {
562  dest_it.add_after_then_move(src_it.extract());
563  }
564  }
565 }
566 
567 // are the two boxes immediate neighbors along the vertical direction
568 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
569  int x_margin = 20;
570  int y_margin = 5;
571  return (abs(b1.left() - b2.left()) < x_margin) &&
572  (abs(b1.right() - b2.right()) < x_margin) &&
573  (abs(b1.top()-b2.bottom()) < y_margin ||
574  abs(b2.top()-b1.bottom()) < y_margin);
575 }
576 
577 // Set up info for clean_part_grid_ partitions to be valid during detection
578 // code.
580  FindNeighbors();
581  SetPartitionSpacings(&clean_part_grid_, all_columns);
583 }
584 
585 // Set left, right and top, bottom spacings of each colpartition.
587  ColPartitionSet** all_columns) {
588  // Iterate the ColPartitions in the grid.
589  ColPartitionGridSearch gsearch(grid);
590  gsearch.StartFullSearch();
591  ColPartition* part = nullptr;
592  while ((part = gsearch.NextFullSearch()) != nullptr) {
593  ColPartitionSet* columns = all_columns[gsearch.GridY()];
594  TBOX box = part->bounding_box();
595  int y = part->MidY();
596  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
597  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
598  // set distance from left column as space to the left
599  if (left_column) {
600  int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
601  part->set_space_to_left(left_space);
602  }
603  // set distance from right column as space to the right
604  if (right_column) {
605  int right_space = std::max(0, right_column->RightAtY(y) - box.right());
606  part->set_space_to_right(right_space);
607  }
608 
609  // Look for images that may be closer.
610  // NOTE: used to be part_grid_, might cause issues now
611  ColPartitionGridSearch hsearch(grid);
612  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
613  ColPartition* neighbor = nullptr;
614  while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
615  if (neighbor->type() == PT_PULLOUT_IMAGE ||
616  neighbor->type() == PT_FLOWING_IMAGE ||
617  neighbor->type() == PT_HEADING_IMAGE) {
618  int right = neighbor->bounding_box().right();
619  if (right < box.left()) {
620  int space = std::min(box.left() - right, part->space_to_left());
621  part->set_space_to_left(space);
622  }
623  }
624  }
625  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626  neighbor = nullptr;
627  while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
628  if (neighbor->type() == PT_PULLOUT_IMAGE ||
629  neighbor->type() == PT_FLOWING_IMAGE ||
630  neighbor->type() == PT_HEADING_IMAGE) {
631  int left = neighbor->bounding_box().left();
632  if (left > box.right()) {
633  int space = std::min(left - box.right(), part->space_to_right());
634  part->set_space_to_right(space);
635  }
636  }
637  }
638 
639  ColPartition* upper_part = part->SingletonPartner(true);
640  if (upper_part) {
641  int space = std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
642  part->bounding_box().bottom()));
643  part->set_space_above(space);
644  } else {
645  // TODO(nbeato): What constitutes a good value?
646  // 0 is the default value when not set, explicitly noting it needs to
647  // be something else.
648  part->set_space_above(INT32_MAX);
649  }
650 
651  ColPartition* lower_part = part->SingletonPartner(false);
652  if (lower_part) {
653  int space = std::max(0, static_cast<int>(part->bounding_box().bottom() -
654  lower_part->bounding_box().bottom()));
655  part->set_space_below(space);
656  } else {
657  // TODO(nbeato): What constitutes a good value?
658  // 0 is the default value when not set, explicitly noting it needs to
659  // be something else.
660  part->set_space_below(INT32_MAX);
661  }
662  }
663 }
664 
665 // Set spacing and closest neighbors above and below a given colpartition.
667  TBOX box = part->bounding_box();
668  int top_range = std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
669  int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, static_cast<int>(bleft().y()));
670  box.set_top(top_range);
671  box.set_bottom(bottom_range);
672 
673  TBOX part_box = part->bounding_box();
674  // Start a rect search
676  rectsearch(&clean_part_grid_);
677  rectsearch.StartRectSearch(box);
678  ColPartition* neighbor;
679  int min_space_above = kMaxVerticalSpacing;
680  int min_space_below = kMaxVerticalSpacing;
681  ColPartition* above_neighbor = nullptr;
682  ColPartition* below_neighbor = nullptr;
683  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
684  if (neighbor == part)
685  continue;
686  TBOX neighbor_box = neighbor->bounding_box();
687  if (neighbor_box.major_x_overlap(part_box)) {
688  int gap = abs(part->median_bottom() - neighbor->median_bottom());
689  // If neighbor is below current partition
690  if (neighbor_box.top() < part_box.bottom() &&
691  gap < min_space_below) {
692  min_space_below = gap;
693  below_neighbor = neighbor;
694  } // If neighbor is above current partition
695  else if (part_box.top() < neighbor_box.bottom() &&
696  gap < min_space_above) {
697  min_space_above = gap;
698  above_neighbor = neighbor;
699  }
700  }
701  }
702  part->set_space_above(min_space_above);
703  part->set_space_below(min_space_below);
704  part->set_nearest_neighbor_above(above_neighbor);
705  part->set_nearest_neighbor_below(below_neighbor);
706 }
707 
708 // Set global spacing and x-height estimates
710  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
711  STATS width_stats(0, kMaxBlobWidth + 1);
712  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
713  // Iterate the ColPartitions in the grid.
714  ColPartitionGridSearch gsearch(grid);
715  gsearch.SetUniqueMode(true);
716  gsearch.StartFullSearch();
717  ColPartition* part = nullptr;
718  while ((part = gsearch.NextFullSearch()) != nullptr) {
719  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
720  // ComputeLimits needs to get called somewhere outside of TableFinder
721  // to make sure the partitions are properly initialized.
722  // When this is called, SmoothPartitionPartners dies in an assert after
723  // table find runs. Alternative solution.
724  // part->ComputeLimits();
725  if (part->IsTextType()) {
726  // xheight_stats.add(part->median_height(), part->boxes_count());
727  // width_stats.add(part->median_width(), part->boxes_count());
728 
729  // This loop can be removed when above issues are fixed.
730  // Replace it with the 2 lines commented out above.
731  BLOBNBOX_C_IT it(part->boxes());
732  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733  xheight_stats.add(it.data()->bounding_box().height(), 1);
734  width_stats.add(it.data()->bounding_box().width(), 1);
735  }
736 
737  ledding_stats.add(part->space_above(), 1);
738  ledding_stats.add(part->space_below(), 1);
739  }
740  }
741  // Set estimates based on median of statistics obtained
742  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
743  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
744  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
745  #ifndef GRAPHICS_DISABLED
746  if (textord_tablefind_show_stats) {
747  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
748  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
749  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
750  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
751  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
752  }
753  #endif // GRAPHICS_DISABLED
754 }
755 
757  global_median_xheight_ = xheight;
758 }
761 }
763  global_median_ledding_ = ledding;
764 }
765 
768  gsearch.StartFullSearch();
769  ColPartition* part = nullptr;
770  while ((part = gsearch.NextFullSearch()) != nullptr) {
771  // TODO(nbeato): Rename this function, meaning is different now.
772  // IT is finding nearest neighbors its own way
773  //SetVerticalSpacing(part);
774 
775  ColPartition* upper = part->SingletonPartner(true);
776  if (upper)
777  part->set_nearest_neighbor_above(upper);
778 
779  ColPartition* lower = part->SingletonPartner(false);
780  if (lower)
781  part->set_nearest_neighbor_below(lower);
782  }
783 }
784 
785 // High level interface. Input is an unmarked ColPartitionGrid
786 // (namely, clean_part_grid_). Partitions are identified using local
787 // information and filter/smoothed. The function exit should contain
788 // a good sampling of the table partitions.
791  if (textord_tablefind_show_mark) {
792  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
796  }
798  if (textord_tablefind_show_mark) {
799  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
803  }
805  if (textord_tablefind_show_mark) {
806  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
810  }
812  if (textord_tablefind_show_mark || textord_show_tables) {
813  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
817  }
818 }
819 
820 // These types of partitions are marked as table partitions:
821 // 1- Partitions that have at lease one large gap between words
822 // 2- Partitions that consist of only one word (no significant gap
823 // between components)
824 // 3- Partitions that vertically overlap with other partitions within the
825 // same column.
826 // 4- Partitions with leaders before/after them.
828  // Iterate the ColPartitions in the grid.
830  gsearch(&clean_part_grid_);
831  gsearch.StartFullSearch();
832  ColPartition* part = nullptr;
833  while ((part = gsearch.NextFullSearch()) != nullptr) {
834  if (!part->IsTextType()) // Only consider text partitions
835  continue;
836  // Only consider partitions in dominant font size or smaller
838  continue;
839  // Mark partitions with a large gap, or no significant gap as
840  // table partitions.
841  // Comments: It produces several false alarms at:
842  // - last line of a paragraph (fixed)
843  // - single word section headings
844  // - page headers and footers
845  // - numbered equations
846  // - line drawing regions
847  // TODO(faisal): detect and fix above-mentioned cases
848  if (HasWideOrNoInterWordGap(part) ||
849  HasLeaderAdjacent(*part)) {
850  part->set_table_type();
851  }
852  }
853 }
854 
855 // Check if the partition has at least one large gap between words or no
856 // significant gap at all
858  // Should only get text partitions.
859  ASSERT_HOST(part->IsTextType());
860  // Blob access
861  BLOBNBOX_CLIST* part_boxes = part->boxes();
862  BLOBNBOX_C_IT it(part_boxes);
863  // Check if this is a relatively small partition (such as a single word)
864  if (part->bounding_box().width() <
866  part_boxes->length() < kMinBoxesInTextPartition)
867  return true;
868 
869  // Variables used to compute inter-blob spacing.
870  int current_x0 = -1;
871  int current_x1 = -1;
872  int previous_x1 = -1;
873  // Stores the maximum gap detected.
874  int largest_partition_gap_found = -1;
875  // Text partition gap limits. If this is text (and not a table),
876  // there should be at least one gap larger than min_gap and no gap
877  // larger than max_gap.
878  const double max_gap = kMaxGapInTextPartition * part->median_height();
879  const double min_gap = kMinMaxGapInTextPartition * part->median_height();
880 
881  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
882  BLOBNBOX* blob = it.data();
883  current_x0 = blob->bounding_box().left();
884  current_x1 = blob->bounding_box().right();
885  if (previous_x1 != -1) {
886  int gap = current_x0 - previous_x1;
887 
888  // TODO(nbeato): Boxes may overlap? Huh?
889  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
890  // on the top right of the page are filtered out with this line.
891  // Note 2: Iterating over blobs in a partition, so we are looking for
892  // spacing between the words.
893  if (gap < 0) {
894  // More likely case, the blobs slightly overlap. This can happen
895  // with diacritics (accents) or broken alphabet symbols (characters).
896  // Merge boxes together by taking max of right sides.
897  if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
898  previous_x1 = std::max(previous_x1, current_x1);
899  continue;
900  }
901  // Extreme case, blobs overlap significantly in the same partition...
902  // This should not happen often (if at all), but it does.
903  // TODO(nbeato): investigate cases when this happens.
904  else {
905  // The behavior before was to completely ignore this case.
906  }
907  }
908 
909  // If a large enough gap is found, mark it as a table cell (return true)
910  if (gap > max_gap)
911  return true;
912  if (gap > largest_partition_gap_found)
913  largest_partition_gap_found = gap;
914  }
915  previous_x1 = current_x1;
916  }
917  // Since no large gap was found, return false if the partition is too
918  // long to be a data cell
919  if (part->bounding_box().width() >
921  part_boxes->length() > kMaxBoxesInDataPartition)
922  return false;
923 
924  // A partition may be a single blob. In this case, it's an isolated symbol
925  // or non-text (such as a ruling or image).
926  // Detect these as table partitions? Shouldn't this be case by case?
927  // The behavior before was to ignore this, making max_partition_gap < 0
928  // and implicitly return true. Just making it explicit.
929  if (largest_partition_gap_found == -1)
930  return true;
931 
932  // return true if the maximum gap found is smaller than the minimum allowed
933  // max_gap in a text partition. This indicates that there is no significant
934  // space in the partition, hence it is likely a single word.
935  return largest_partition_gap_found < min_gap;
936 }
937 
938 // A criteria for possible tables is that a table may have leaders
939 // between data cells. An aggressive solution to find such tables is to
940 // explicitly mark partitions that have adjacent leaders.
941 // Note that this includes overlapping leaders. However, it does not
942 // include leaders in different columns on the page.
943 // Possible false-positive will include lists, such as a table of contents.
944 // As these arise, the aggressive nature of this search may need to be
945 // trimmed down.
947  if (part.flow() == BTFT_LEADER)
948  return true;
949  // Search range is left and right bounded by an offset of the
950  // median xheight. This offset is to allow some tolerance to the
951  // the leaders on the page in the event that the alignment is still
952  // a bit off.
953  const TBOX& box = part.bounding_box();
954  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
955  const int top = box.top() + search_size;
956  const int bottom = box.bottom() - search_size;
958  for (int direction = 0; direction < 2; ++direction) {
959  bool right_to_left = (direction == 0);
960  int x = right_to_left ? box.right() : box.left();
961  hsearch.StartSideSearch(x, bottom, top);
962  ColPartition* leader = nullptr;
963  while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
964  // The leader could be a horizontal ruling in the grid.
965  // Make sure it is actually a leader.
966  if (leader->flow() != BTFT_LEADER)
967  continue;
968  // This should not happen, they are in different grids.
969  ASSERT_HOST(&part != leader);
970  // Make sure the leader shares a page column with the partition,
971  // otherwise we are spreading across columns.
972  if (!part.IsInSameColumnAs(*leader))
973  break;
974  // There should be a significant vertical overlap
975  if (!leader->VSignificantCoreOverlap(part))
976  continue;
977  // Leader passed all tests, so it is adjacent.
978  return true;
979  }
980  }
981  // No leaders are adjacent to the given partition.
982  return false;
983 }
984 
985 // Filter individual text partitions marked as table partitions
986 // consisting of paragraph endings, small section headings, and
987 // headers and footers.
991  // TODO(nbeato): Fully justified text as non-table?
992 }
993 
995  // Detect last line of paragraph
996  // Iterate the ColPartitions in the grid.
998  gsearch.StartFullSearch();
999  ColPartition* part = nullptr;
1000  while ((part = gsearch.NextFullSearch()) != nullptr) {
1001  if (part->type() != PT_TABLE)
1002  continue; // Consider only table partitions
1003 
1004  // Paragraph ending should have flowing text above it.
1005  ColPartition* upper_part = part->nearest_neighbor_above();
1006  if (!upper_part)
1007  continue;
1008  if (upper_part->type() != PT_FLOWING_TEXT)
1009  continue;
1010  if (upper_part->bounding_box().width() <
1011  2 * part->bounding_box().width())
1012  continue;
1013  // Check if its the last line of a paragraph.
1014  // In most cases, a paragraph ending should be left-aligned to text line
1015  // above it. Sometimes, it could be a 2 line paragraph, in which case
1016  // the line above it is indented.
1017  // To account for that, check if the partition center is to
1018  // the left of the one above it.
1019  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1020  int upper_mid = (upper_part->bounding_box().left() +
1021  upper_part->bounding_box().right()) / 2;
1022  int current_spacing = 0; // spacing of the current line to margin
1023  int upper_spacing = 0; // spacing of the previous line to the margin
1025  // Left to right languages, use mid - left to figure out the distance
1026  // the middle is from the left margin.
1027  int left = std::min(part->bounding_box().left(),
1028  upper_part->bounding_box().left());
1029  current_spacing = mid - left;
1030  upper_spacing = upper_mid - left;
1031  } else {
1032  // Right to left languages, use right - mid to figure out the distance
1033  // the middle is from the right margin.
1034  int right = std::max(part->bounding_box().right(),
1035  upper_part->bounding_box().right());
1036  current_spacing = right - mid;
1037  upper_spacing = right - upper_mid;
1038  }
1039  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1040  continue;
1041 
1042  // Paragraphs should have similar fonts.
1043  if (!part->MatchingSizes(*upper_part) ||
1046  continue;
1047  }
1048 
1049  // The last line of a paragraph should be left aligned.
1050  // TODO(nbeato): This would be untrue if the text was right aligned.
1051  // How often is that?
1052  if (part->space_to_left() >
1054  continue;
1055  // The line above it should be right aligned (assuming justified format).
1056  // Since we can't assume justified text, we compare whitespace to text.
1057  // The above line should have majority spanning text (or the current
1058  // line could have fit on the previous line). So compare
1059  // whitespace to text.
1060  if (upper_part->bounding_box().width() <
1062  continue;
1063 
1064  // Ledding above the line should be less than ledding below
1065  if (part->space_above() >= part->space_below() ||
1066  part->space_above() > 2 * global_median_ledding_)
1067  continue;
1068 
1069  // If all checks failed, it is probably text.
1070  part->clear_table_type();
1071  }
1072 }
1073 
1075  // Consider top-most text colpartition as header and bottom most as footer
1076  ColPartition* header = nullptr;
1077  ColPartition* footer = nullptr;
1078  int max_top = INT32_MIN;
1079  int min_bottom = INT32_MAX;
1081  gsearch.StartFullSearch();
1082  ColPartition* part = nullptr;
1083  while ((part = gsearch.NextFullSearch()) != nullptr) {
1084  if (!part->IsTextType())
1085  continue; // Consider only text partitions
1086  int top = part->bounding_box().top();
1087  int bottom = part->bounding_box().bottom();
1088  if (top > max_top) {
1089  max_top = top;
1090  header = part;
1091  }
1092  if (bottom < min_bottom) {
1093  min_bottom = bottom;
1094  footer = part;
1095  }
1096  }
1097  if (header)
1098  header->clear_table_type();
1099  if (footer)
1100  footer->clear_table_type();
1101 }
1102 
1103 // Mark all ColPartitions as table cells that have a table cell above
1104 // and below them
1105 // TODO(faisal): This is too aggressive at the moment. The method needs to
1106 // consider spacing and alignment as well. Detection of false alarm table cells
1107 // should also be done as part of it.
1109  // Iterate the ColPartitions in the grid.
1111  gsearch.StartFullSearch();
1112  ColPartition* part = nullptr;
1113  while ((part = gsearch.NextFullSearch()) != nullptr) {
1114  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1115  continue; // Consider only text partitions
1116  ColPartition* upper_part = part->nearest_neighbor_above();
1117  ColPartition* lower_part = part->nearest_neighbor_below();
1118  if (!upper_part || !lower_part)
1119  continue;
1120  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1121  part->set_table_type();
1122  }
1123 
1124  // Pass 2, do the opposite. If both the upper and lower neighbors
1125  // exist and are not tables, this probably shouldn't be a table.
1126  gsearch.StartFullSearch();
1127  part = nullptr;
1128  while ((part = gsearch.NextFullSearch()) != nullptr) {
1129  if (part->type() != PT_TABLE)
1130  continue; // Consider only text partitions
1131  ColPartition* upper_part = part->nearest_neighbor_above();
1132  ColPartition* lower_part = part->nearest_neighbor_below();
1133 
1134  // table can't be by itself
1135  if ((upper_part && upper_part->type() != PT_TABLE) &&
1136  (lower_part && lower_part->type() != PT_TABLE)) {
1137  part->clear_table_type();
1138  }
1139  }
1140 }
1141 
1142 // Set the type of a column segment based on the ratio of table to text cells
1143 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
1144  ColSegment_IT it(column_blocks);
1145  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1146  ColSegment* seg = it.data();
1147  TBOX box = seg->bounding_box();
1148  int num_table_cells = 0;
1149  int num_text_cells = 0;
1151  rsearch(&clean_part_grid_);
1152  rsearch.SetUniqueMode(true);
1153  rsearch.StartRectSearch(box);
1154  ColPartition* part = nullptr;
1155  while ((part = rsearch.NextRectSearch()) != nullptr) {
1156  if (part->type() == PT_TABLE) {
1157  num_table_cells++;
1158  } else if (part->type() == PT_FLOWING_TEXT) {
1159  num_text_cells++;
1160  }
1161  }
1162  // If a column block has no text or table partition in it, it is not needed
1163  // for table detection.
1164  if (!num_table_cells && !num_text_cells) {
1165  delete it.extract();
1166  } else {
1167  seg->set_num_table_cells(num_table_cells);
1168  seg->set_num_text_cells(num_text_cells);
1169  // set column type based on the ratio of table to text cells
1170  seg->set_type();
1171  }
1172  }
1173 }
1174 
1175 // Move column blocks to grid
1176 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1177  ColSegmentGrid *col_seg_grid) {
1178  ColSegment_IT it(segments);
1179  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180  ColSegment* seg = it.extract();
1181  col_seg_grid->InsertBBox(true, true, seg);
1182  }
1183 }
1184 
1185 // Merge column blocks if a split is detected due to the presence of a
1186 // table. A text block is considered split if it has multiple
1187 // neighboring blocks above/below it, and at least one of the
1188 // neighboring blocks is of table type (has a high density of table
1189 // partitions). In this case neighboring blocks in the direction
1190 // (above/below) of the table block are merged with the text block.
1191 
1192 // Comment: This method does not handle split due to a full page table
1193 // since table columns in this case do not have a text column on which
1194 // split decision can be based.
1196  int margin = gridsize();
1197 
1198  // Iterate the Column Blocks in the grid.
1200  gsearch(&col_seg_grid_);
1201  gsearch.StartFullSearch();
1202  ColSegment* seg;
1203  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1204  if (seg->type() != COL_TEXT)
1205  continue; // only consider text blocks for split detection
1206  bool neighbor_found = false;
1207  bool modified = false; // Modified at least once
1208  // keep expanding current box as long as neighboring table columns
1209  // are found above or below it.
1210  do {
1211  TBOX box = seg->bounding_box();
1212  // slightly expand the search region vertically
1213  int top_range = std::min(box.top() + margin, static_cast<int>(tright().y()));
1214  int bottom_range = std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1215  box.set_top(top_range);
1216  box.set_bottom(bottom_range);
1217  neighbor_found = false;
1219  rectsearch(&col_seg_grid_);
1220  rectsearch.StartRectSearch(box);
1221  ColSegment* neighbor = nullptr;
1222  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1223  if (neighbor == seg)
1224  continue;
1225  const TBOX& neighbor_box = neighbor->bounding_box();
1226  // If the neighbor box significantly overlaps with the current
1227  // box (due to the expansion of the current box in the
1228  // previous iteration of this loop), remove the neighbor box
1229  // and expand the current box to include it.
1230  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1231  seg->InsertBox(neighbor_box);
1232  modified = true;
1233  rectsearch.RemoveBBox();
1234  gsearch.RepositionIterator();
1235  delete neighbor;
1236  continue;
1237  }
1238  // Only expand if the neighbor box is of table type
1239  if (neighbor->type() != COL_TABLE)
1240  continue;
1241  // Insert the neighbor box into the current column block
1242  if (neighbor_box.major_x_overlap(box) &&
1243  !box.contains(neighbor_box)) {
1244  seg->InsertBox(neighbor_box);
1245  neighbor_found = true;
1246  modified = true;
1247  rectsearch.RemoveBBox();
1248  gsearch.RepositionIterator();
1249  delete neighbor;
1250  }
1251  }
1252  } while (neighbor_found);
1253  if (modified) {
1254  // Because the box has changed, it has to be removed first.
1255  gsearch.RemoveBBox();
1256  col_seg_grid_.InsertBBox(true, true, seg);
1257  gsearch.RepositionIterator();
1258  }
1259  }
1260 }
1261 
1262 // Group horizontally overlapping table partitions into table columns.
1263 // TODO(faisal): This is too aggressive at the moment. The method should
1264 // consider more attributes to group table partitions together. Some common
1265 // errors are:
1266 // 1- page number is merged with a table column above it even
1267 // if there is a large vertical gap between them.
1268 // 2- column headers go on to catch one of the columns arbitrarily
1269 // 3- an isolated noise blob near page top or bottom merges with the table
1270 // column below/above it
1271 // 4- cells from two vertically adjacent tables merge together to make a
1272 // single column resulting in merging of the two tables
1273 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1274  ColSegment_IT it(table_columns);
1275  // Iterate the ColPartitions in the grid.
1277  gsearch(&clean_part_grid_);
1278  gsearch.StartFullSearch();
1279  ColPartition* part;
1280  while ((part = gsearch.NextFullSearch()) != nullptr) {
1281  if (part->inside_table_column() || part->type() != PT_TABLE)
1282  continue; // prevent a partition to be assigned to multiple columns
1283  const TBOX& box = part->bounding_box();
1284  auto* col = new ColSegment();
1285  col->InsertBox(box);
1286  part->set_inside_table_column(true);
1287  // Start a search below the current cell to find bottom neighbours
1288  // Note: a full search will always process things above it first, so
1289  // this should be starting at the highest cell and working its way down.
1291  vsearch(&clean_part_grid_);
1292  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1293  ColPartition* neighbor = nullptr;
1294  bool found_neighbours = false;
1295  while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1296  // only consider neighbors not assigned to any column yet
1297  if (neighbor->inside_table_column())
1298  continue;
1299  // Horizontal lines should not break the flow
1300  if (neighbor->IsHorizontalLine())
1301  continue;
1302  // presence of a non-table neighbor marks the end of current
1303  // table column
1304  if (neighbor->type() != PT_TABLE)
1305  break;
1306  // add the neighbor partition to the table column
1307  const TBOX& neighbor_box = neighbor->bounding_box();
1308  col->InsertBox(neighbor_box);
1309  neighbor->set_inside_table_column(true);
1310  found_neighbours = true;
1311  }
1312  if (found_neighbours) {
1313  it.add_after_then_move(col);
1314  } else {
1315  part->set_inside_table_column(false);
1316  delete col;
1317  }
1318  }
1319 }
1320 
1321 // Mark regions in a column that are x-bounded by the column boundaries and
1322 // y-bounded by the table columns' projection on the y-axis as table regions
1323 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
1324  ColSegment_LIST* table_regions) {
1325  ColSegment_IT cit(table_columns);
1326  ColSegment_IT rit(table_regions);
1327  // Iterate through column blocks
1329  gsearch(&col_seg_grid_);
1330  gsearch.StartFullSearch();
1331  ColSegment* part;
1332  int page_height = tright().y() - bleft().y();
1333  ASSERT_HOST(page_height > 0);
1334  // create a bool array to hold projection on y-axis
1335  bool* table_region = new bool[page_height];
1336  while ((part = gsearch.NextFullSearch()) != nullptr) {
1337  const TBOX& part_box = part->bounding_box();
1338  // reset the projection array
1339  for (int i = 0; i < page_height; i++) {
1340  table_region[i] = false;
1341  }
1342  // iterate through all table columns to find regions in the current
1343  // page column block
1344  cit.move_to_first();
1345  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346  TBOX col_box = cit.data()->bounding_box();
1347  // find intersection region of table column and page column
1348  TBOX intersection_box = col_box.intersection(part_box);
1349  // project table column on the y-axis
1350  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1351  table_region[i - bleft().y()] = true;
1352  }
1353  }
1354  // set x-limits of table regions to page column width
1355  TBOX current_table_box;
1356  current_table_box.set_left(part_box.left());
1357  current_table_box.set_right(part_box.right());
1358  // go through the y-axis projection to find runs of table
1359  // regions. Each run makes one table region.
1360  for (int i = 1; i < page_height; i++) {
1361  // detect start of a table region
1362  if (!table_region[i - 1] && table_region[i]) {
1363  current_table_box.set_bottom(i + bleft().y());
1364  }
1365  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1366  // detect end of a table region
1367  if (table_region[i - 1] && !table_region[i]) {
1368  current_table_box.set_top(i + bleft().y());
1369  if (!current_table_box.null_box()) {
1370  auto* seg = new ColSegment();
1371  seg->InsertBox(current_table_box);
1372  rit.add_after_then_move(seg);
1373  }
1374  }
1375  }
1376  }
1377  delete[] table_region;
1378 }
1379 
1380 // Merge table regions corresponding to tables spanning multiple columns if
1381 // there is a colpartition (horizontal ruling line or normal text) that
1382 // touches both regions.
1383 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1384 // tables with aligned ruling lines. In this case, line finder returns a
1385 // single line and hence the tables get merged together
1387  // Iterate the table regions in the grid.
1389  gsearch(&table_grid_);
1390  gsearch.StartFullSearch();
1391  ColSegment* seg = nullptr;
1392  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1393  bool neighbor_found = false;
1394  bool modified = false; // Modified at least once
1395  do {
1396  // Start a rectangle search x-bounded by the image and y by the table
1397  const TBOX& box = seg->bounding_box();
1398  TBOX search_region(box);
1399  search_region.set_left(bleft().x());
1400  search_region.set_right(tright().x());
1401  neighbor_found = false;
1403  rectsearch(&table_grid_);
1404  rectsearch.StartRectSearch(search_region);
1405  ColSegment* neighbor = nullptr;
1406  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1407  if (neighbor == seg)
1408  continue;
1409  const TBOX& neighbor_box = neighbor->bounding_box();
1410  // Check if a neighbor box has a large overlap with the table
1411  // region. This may happen as a result of merging two table
1412  // regions in the previous iteration.
1413  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1414  seg->InsertBox(neighbor_box);
1415  rectsearch.RemoveBBox();
1416  gsearch.RepositionIterator();
1417  delete neighbor;
1418  modified = true;
1419  continue;
1420  }
1421  // Check if two table regions belong together based on a common
1422  // horizontal ruling line
1423  if (BelongToOneTable(box, neighbor_box)) {
1424  seg->InsertBox(neighbor_box);
1425  neighbor_found = true;
1426  modified = true;
1427  rectsearch.RemoveBBox();
1428  gsearch.RepositionIterator();
1429  delete neighbor;
1430  }
1431  }
1432  } while (neighbor_found);
1433  if (modified) {
1434  // Because the box has changed, it has to be removed first.
1435  gsearch.RemoveBBox();
1436  table_grid_.InsertBBox(true, true, seg);
1437  gsearch.RepositionIterator();
1438  }
1439  }
1440 }
1441 
1442 // Decide if two table regions belong to one table based on a common
1443 // horizontal ruling line or another colpartition
1444 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1445  // Check the obvious case. Most likely not true because overlapping boxes
1446  // should already be merged, but seems like a good thing to do in case things
1447  // change.
1448  if (box1.overlap(box2))
1449  return true;
1450  // Check for ColPartitions spanning both table regions
1451  TBOX bbox = box1.bounding_union(box2);
1452  // Start a rect search on bbox
1454  rectsearch(&clean_part_grid_);
1455  rectsearch.StartRectSearch(bbox);
1456  ColPartition* part = nullptr;
1457  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1458  const TBOX& part_box = part->bounding_box();
1459  // return true if a colpartition spanning both table regions is found
1460  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1461  !part->IsImageType())
1462  return true;
1463  }
1464  return false;
1465 }
1466 
1467 // Adjust table boundaries by:
1468 // - building a tight bounding box around all ColPartitions contained in it.
1469 // - expanding table boundaries to include all colpartitions that overlap the
1470 // table by more than half of their area
1471 // - expanding table boundaries to include nearby horizontal rule lines
1472 // - expanding table vertically to include left out column headers
1473 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1474 // makes following errors:
1475 // 1- horizontal lines consisting of underlines are included in the table if
1476 // they are close enough
1477 // 2- horizontal lines originating from noise tend to get merged with a table
1478 // near the top of the page
1479 // 3- the criteria for including horizontal lines is very generous. Many times
1480 // horizontal lines separating headers and footers get merged with a
1481 // single-column table in a multi-column page thereby including text
1482 // from the neighboring column inside the table
1483 // 4- the criteria for including left out column headers also tends to
1484 // occasionally include text-lines above the tables, typically from
1485 // table caption
1487  // Iterate the table regions in the grid
1488  ColSegment_CLIST adjusted_tables;
1489  ColSegment_C_IT it(&adjusted_tables);
1491  gsearch.StartFullSearch();
1492  ColSegment* table = nullptr;
1493  while ((table = gsearch.NextFullSearch()) != nullptr) {
1494  const TBOX& table_box = table->bounding_box();
1495  TBOX grown_box = table_box;
1496  GrowTableBox(table_box, &grown_box);
1497  // To prevent a table from expanding again, do not insert the
1498  // modified box back to the grid. Instead move it to a list and
1499  // and remove it from the grid. The list is moved later back to the grid.
1500  if (!grown_box.null_box()) {
1501  auto* col = new ColSegment();
1502  col->InsertBox(grown_box);
1503  it.add_after_then_move(col);
1504  }
1505  gsearch.RemoveBBox();
1506  delete table;
1507  }
1508  // clear table grid to move final tables in it
1509  // TODO(nbeato): table_grid_ should already be empty. The above loop
1510  // removed everything. Maybe just assert it is empty?
1511  table_grid_.Clear();
1512  it.move_to_first();
1513  // move back final tables to table_grid_
1514  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1515  ColSegment* seg = it.extract();
1516  table_grid_.InsertBBox(true, true, seg);
1517  }
1518 }
1519 
1520 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
1521  // TODO(nbeato): The growing code is a bit excessive right now.
1522  // By removing these lines, the partitions considered need
1523  // to have some overlap or be special cases. These lines could
1524  // be added again once a check is put in place to make sure that
1525  // growing tables don't stomp on a lot of non-table partitions.
1526 
1527  // search for horizontal ruling lines within the vertical margin
1528  // int vertical_margin = kRulingVerticalMargin * gridsize();
1529  TBOX search_box = table_box;
1530  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1531  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1532  // search_box.set_top(top);
1533  // search_box.set_bottom(bottom);
1534 
1535  GrowTableToIncludePartials(table_box, search_box, result_box);
1536  GrowTableToIncludeLines(table_box, search_box, result_box);
1537  IncludeLeftOutColumnHeaders(result_box);
1538 }
1539 
1540 // Grow a table by increasing the size of the box to include
1541 // partitions with significant overlap with the table.
1543  const TBOX& search_range,
1544  TBOX* result_box) {
1545  // Rulings are in a different grid, so search 2 grids for rulings, text,
1546  // and table partitions that are not entirely within the new box.
1547  for (int i = 0; i < 2; ++i) {
1548  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1550  ColPartitionGridSearch rectsearch(grid);
1551  rectsearch.StartRectSearch(search_range);
1552  ColPartition* part = nullptr;
1553  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1554  // Only include text and table types.
1555  if (part->IsImageType())
1556  continue;
1557  const TBOX& part_box = part->bounding_box();
1558  // Include partition in the table if more than half of it
1559  // is covered by the table
1560  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1561  *result_box = result_box->bounding_union(part_box);
1562  continue;
1563  }
1564  }
1565  }
1566 }
1567 
1568 // Grow a table by expanding to the extents of significantly
1569 // overlapping lines.
1571  const TBOX& search_range,
1572  TBOX* result_box) {
1574  rsearch.SetUniqueMode(true);
1575  rsearch.StartRectSearch(search_range);
1576  ColPartition* part = nullptr;
1577  while ((part = rsearch.NextRectSearch()) != nullptr) {
1578  // TODO(nbeato) This should also do vertical, but column
1579  // boundaries are breaking things. This function needs to be
1580  // updated to allow vertical lines as well.
1581  if (!part->IsLineType())
1582  continue;
1583  // Avoid the following function call if the result of the
1584  // function is irrelevant.
1585  const TBOX& part_box = part->bounding_box();
1586  if (result_box->contains(part_box))
1587  continue;
1588  // Include a partially overlapping horizontal line only if the
1589  // extra ColPartitions that will be included due to expansion
1590  // have large side spacing w.r.t. columns containing them.
1591  if (HLineBelongsToTable(*part, table_box))
1592  *result_box = result_box->bounding_union(part_box);
1593  // TODO(nbeato): Vertical
1594  }
1595 }
1596 
1597 // Checks whether the horizontal line belong to the table by looking at the
1598 // side spacing of extra ColParitions that will be included in the table
1599 // due to expansion
1601  const TBOX& table_box) {
1602  if (!part.IsHorizontalLine())
1603  return false;
1604  const TBOX& part_box = part.bounding_box();
1605  if (!part_box.major_x_overlap(table_box))
1606  return false;
1607  // Do not consider top-most horizontal line since it usually
1608  // originates from noise.
1609  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1610  // have neighbors solved.
1611  // if (!part.nearest_neighbor_above())
1612  // return false;
1613  const TBOX bbox = part_box.bounding_union(table_box);
1614  // In the "unioned table" box (the table extents expanded by the line),
1615  // keep track of how many partitions have significant padding to the left
1616  // and right. If more than half of the partitions covered by the new table
1617  // have significant spacing, the line belongs to the table and the table
1618  // grows to include all of the partitions.
1619  int num_extra_partitions = 0;
1620  int extra_space_to_right = 0;
1621  int extra_space_to_left = 0;
1622  // Rulings are in a different grid, so search 2 grids for rulings, text,
1623  // and table partitions that are introduced by the new box.
1624  for (int i = 0; i < 2; ++i) {
1625  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1627  // Start a rect search on bbox
1628  ColPartitionGridSearch rectsearch(grid);
1629  rectsearch.SetUniqueMode(true);
1630  rectsearch.StartRectSearch(bbox);
1631  ColPartition* extra_part = nullptr;
1632  while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1633  // ColPartition already in table
1634  const TBOX& extra_part_box = extra_part->bounding_box();
1635  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1636  continue;
1637  // Non-text ColPartitions do not contribute
1638  if (extra_part->IsImageType())
1639  continue;
1640  // Consider this partition.
1641  num_extra_partitions++;
1642  // presence of a table cell is a strong hint, so just increment the scores
1643  // without looking at the spacing.
1644  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1645  extra_space_to_right++;
1646  extra_space_to_left++;
1647  continue;
1648  }
1649  int space_threshold = kSideSpaceMargin * part.median_height();
1650  if (extra_part->space_to_right() > space_threshold)
1651  extra_space_to_right++;
1652  if (extra_part->space_to_left() > space_threshold)
1653  extra_space_to_left++;
1654  }
1655  }
1656  // tprintf("%d %d %d\n",
1657  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1658  return (extra_space_to_right > num_extra_partitions / 2) ||
1659  (extra_space_to_left > num_extra_partitions / 2);
1660 }
1661 
1662 // Look for isolated column headers above the given table box and
1663 // include them in the table
1665  // Start a search above the current table to look for column headers
1667  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1668  table_box->top());
1669  ColPartition* neighbor = nullptr;
1670  ColPartition* previous_neighbor = nullptr;
1671  while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1672  // Max distance to find a table heading.
1673  const int max_distance = kMaxColumnHeaderDistance *
1674  neighbor->median_height();
1675  int table_top = table_box->top();
1676  const TBOX& box = neighbor->bounding_box();
1677  // Do not continue if the next box is way above
1678  if (box.bottom() - table_top > max_distance)
1679  break;
1680  // Unconditionally include partitions of type TABLE or LINE
1681  // TODO(faisal): add some reasonable conditions here
1682  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1683  table_box->set_top(box.top());
1684  previous_neighbor = nullptr;
1685  continue;
1686  }
1687  // If there are two text partitions, one above the other, without a table
1688  // cell on their left or right side, consider them a barrier and quit
1689  if (previous_neighbor == nullptr) {
1690  previous_neighbor = neighbor;
1691  } else {
1692  const TBOX& previous_box = previous_neighbor->bounding_box();
1693  if (!box.major_y_overlap(previous_box))
1694  break;
1695  }
1696  }
1697 }
1698 
1699 // Remove false alarms consisting of a single column based on their
1700 // projection on the x-axis. Projection of a real table on the x-axis
1701 // should have at least one zero-valley larger than the global median
1702 // x-height of the page.
1704  int page_width = tright().x() - bleft().x();
1705  ASSERT_HOST(page_width > 0);
1706  // create an integer array to hold projection on x-axis
1707  int* table_xprojection = new int[page_width];
1708  // Iterate through all tables in the table grid
1710  table_search(&table_grid_);
1711  table_search.StartFullSearch();
1712  ColSegment* table;
1713  while ((table = table_search.NextFullSearch()) != nullptr) {
1714  TBOX table_box = table->bounding_box();
1715  // reset the projection array
1716  for (int i = 0; i < page_width; i++) {
1717  table_xprojection[i] = 0;
1718  }
1719  // Start a rect search on table_box
1721  rectsearch(&clean_part_grid_);
1722  rectsearch.SetUniqueMode(true);
1723  rectsearch.StartRectSearch(table_box);
1724  ColPartition* part;
1725  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1726  if (!part->IsTextType())
1727  continue; // Do not consider non-text partitions
1728  if (part->flow() == BTFT_LEADER)
1729  continue; // Assume leaders are in tables
1730  TBOX part_box = part->bounding_box();
1731  // Do not consider partitions partially covered by the table
1732  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1733  continue;
1734  BLOBNBOX_CLIST* part_boxes = part->boxes();
1735  BLOBNBOX_C_IT pit(part_boxes);
1736 
1737  // Make sure overlapping blobs don't artificially inflate the number
1738  // of rows in the table. This happens frequently with things such as
1739  // decimals and split characters. Do this by assuming the column
1740  // partition is sorted mostly left to right and just clip
1741  // bounding boxes by the previous box's extent.
1742  int next_position_to_write = 0;
1743 
1744  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1745  BLOBNBOX *pblob = pit.data();
1746  // ignore blob height for the purpose of projection since we
1747  // are only interested in finding valleys
1748  int xstart = pblob->bounding_box().left();
1749  int xend = pblob->bounding_box().right();
1750 
1751  xstart = std::max(xstart, next_position_to_write);
1752  for (int i = xstart; i < xend; i++)
1753  table_xprojection[i - bleft().x()]++;
1754  next_position_to_write = xend;
1755  }
1756  }
1757  // Find largest valley between two reasonable peaks in the table
1758  if (!GapInXProjection(table_xprojection, page_width)) {
1759  table_search.RemoveBBox();
1760  delete table;
1761  }
1762  }
1763  delete[] table_xprojection;
1764 }
1765 
1766 // Return true if at least one gap larger than the global x-height
1767 // exists in the horizontal projection
1768 bool TableFinder::GapInXProjection(int* xprojection, int length) {
1769  // Find peak value of the histogram
1770  int peak_value = 0;
1771  for (int i = 0; i < length; i++) {
1772  if (xprojection[i] > peak_value) {
1773  peak_value = xprojection[i];
1774  }
1775  }
1776  // Peak value represents the maximum number of horizontally
1777  // overlapping colpartitions, so this can be considered as the
1778  // number of rows in the table
1779  if (peak_value < kMinRowsInTable)
1780  return false;
1781  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1782  if (peak_value >= kLargeTableRowCount)
1783  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1784  // Threshold the histogram
1785  for (int i = 0; i < length; i++) {
1786  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1787  }
1788  // Find the largest run of zeros between two ones
1789  int largest_gap = 0;
1790  int run_start = -1;
1791  for (int i = 1; i < length; i++) {
1792  // detect start of a run of zeros
1793  if (xprojection[i - 1] && !xprojection[i]) {
1794  run_start = i;
1795  }
1796  // detect end of a run of zeros and update the value of largest gap
1797  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1798  int gap = i - run_start;
1799  if (gap > largest_gap)
1800  largest_gap = gap;
1801  run_start = -1;
1802  }
1803  }
1804  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1805 }
1806 
1807 // Given the location of a table "guess", try to overlay a cellular
1808 // grid in the location, adjusting the boundaries.
1809 // TODO(nbeato): Falsely introduces:
1810 // -headers/footers (not any worse, too much overlap destroys cells)
1811 // -page numbers (not worse, included because maximize margins)
1812 // -equations (nicely fit into a celluar grid, but more sparsely)
1813 // -figures (random text box, also sparse)
1814 // -small left-aligned text areas with overlapping positioned whitespace
1815 // (rejected before)
1816 // Overall, this just needs some more work.
1818  ScrollView* table_win = nullptr;
1819  if (textord_show_tables) {
1820  table_win = MakeWindow(0, 0, "Table Structure");
1823  // table_grid_.DisplayBoxes(table_win);
1824  }
1825 
1826 
1827  TableRecognizer recognizer;
1828  recognizer.Init();
1830  recognizer.set_text_grid(&fragmented_text_grid_);
1831  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1832  recognizer.set_min_height(1.5 * gridheight());
1833  // Loop over all of the tables and try to fit them.
1834  // Store the good tables here.
1835  ColSegment_CLIST good_tables;
1836  ColSegment_C_IT good_it(&good_tables);
1837 
1839  gsearch.StartFullSearch();
1840  ColSegment* found_table = nullptr;
1841  while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1842  gsearch.RemoveBBox();
1843 
1844  // The goal is to make the tables persistent in a list.
1845  // When that happens, this will move into the search loop.
1846  const TBOX& found_box = found_table->bounding_box();
1847  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1848 
1849  // Process a table. Good tables are inserted into the grid again later on
1850  // We can't change boxes in the grid while it is running a search.
1851  if (table_structure != nullptr) {
1852  if (textord_show_tables) {
1853  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1854  }
1855  found_table->set_bounding_box(table_structure->bounding_box());
1856  delete table_structure;
1857  good_it.add_after_then_move(found_table);
1858  } else {
1859  delete found_table;
1860  }
1861  }
1862  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1863 
1864  // At this point, the grid is empty. We can safely insert the good tables
1865  // back into grid.
1866  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1867  table_grid_.InsertBBox(true, true, good_it.extract());
1868 }
1869 
1870 // Displays the column segments in some window.
1872  ColSegment_LIST *segments,
1873  ScrollView::Color color) {
1874 #ifndef GRAPHICS_DISABLED
1875  win->Pen(color);
1876  win->Brush(ScrollView::NONE);
1877  ColSegment_IT it(segments);
1878  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1879  ColSegment* col = it.data();
1880  const TBOX& box = col->bounding_box();
1881  int left_x = box.left();
1882  int right_x = box.right();
1883  int top_y = box.top();
1884  int bottom_y = box.bottom();
1885  win->Rectangle(left_x, bottom_y, right_x, top_y);
1886  }
1887  win->UpdateWindow();
1888 #endif
1889 }
1890 
1892  ScrollView::Color color) {
1893 #ifndef GRAPHICS_DISABLED
1894  // Iterate the ColPartitions in the grid.
1896  gsearch(grid);
1897  gsearch.StartFullSearch();
1898  ColSegment* seg = nullptr;
1899  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1900  const TBOX& box = seg->bounding_box();
1901  int left_x = box.left();
1902  int right_x = box.right();
1903  int top_y = box.top();
1904  int bottom_y = box.bottom();
1905  win->Brush(ScrollView::NONE);
1906  win->Pen(color);
1907  win->Rectangle(left_x, bottom_y, right_x, top_y);
1908  }
1909  win->UpdateWindow();
1910 #endif
1911 }
1912 
1913 // Displays the colpartitions using a new coloring on an existing window.
1914 // Note: This method is only for debug purpose during development and
1915 // would not be part of checked in code
1917  ColPartitionGrid* grid,
1918  ScrollView::Color default_color,
1919  ScrollView::Color table_color) {
1920 #ifndef GRAPHICS_DISABLED
1921  ScrollView::Color color = default_color;
1922  // Iterate the ColPartitions in the grid.
1924  gsearch(grid);
1925  gsearch.StartFullSearch();
1926  ColPartition* part = nullptr;
1927  while ((part = gsearch.NextFullSearch()) != nullptr) {
1928  color = default_color;
1929  if (part->type() == PT_TABLE)
1930  color = table_color;
1931 
1932  const TBOX& box = part->bounding_box();
1933  int left_x = box.left();
1934  int right_x = box.right();
1935  int top_y = box.top();
1936  int bottom_y = box.bottom();
1937  win->Brush(ScrollView::NONE);
1938  win->Pen(color);
1939  win->Rectangle(left_x, bottom_y, right_x, top_y);
1940  }
1941  win->UpdateWindow();
1942 #endif
1943 }
1945  ColPartitionGrid* grid,
1946  ScrollView::Color default_color) {
1947  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1948 }
1949 
1951  ScrollView* win,
1952  ColPartitionGrid* grid,
1953  ScrollView::Color color) {
1954 #ifndef GRAPHICS_DISABLED
1955  // Iterate the ColPartitions in the grid.
1957  gsearch(grid);
1958  gsearch.StartFullSearch();
1959  ColPartition* part = nullptr;
1960  while ((part = gsearch.NextFullSearch()) != nullptr) {
1961  const TBOX& box = part->bounding_box();
1962  int left_x = box.left();
1963  int right_x = box.right();
1964  int top_y = box.top();
1965  int bottom_y = box.bottom();
1966 
1967  ColPartition* upper_part = part->nearest_neighbor_above();
1968  if (upper_part) {
1969  const TBOX& upper_box = upper_part->bounding_box();
1970  int mid_x = (left_x + right_x) / 2;
1971  int mid_y = (top_y + bottom_y) / 2;
1972  int other_x = (upper_box.left() + upper_box.right()) / 2;
1973  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1974  win->Brush(ScrollView::NONE);
1975  win->Pen(color);
1976  win->Line(mid_x, mid_y, other_x, other_y);
1977  }
1978  ColPartition* lower_part = part->nearest_neighbor_below();
1979  if (lower_part) {
1980  const TBOX& lower_box = lower_part->bounding_box();
1981  int mid_x = (left_x + right_x) / 2;
1982  int mid_y = (top_y + bottom_y) / 2;
1983  int other_x = (lower_box.left() + lower_box.right()) / 2;
1984  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1985  win->Brush(ScrollView::NONE);
1986  win->Pen(color);
1987  win->Line(mid_x, mid_y, other_x, other_y);
1988  }
1989  }
1990  win->UpdateWindow();
1991 #endif
1992 }
1993 
1994 // Merge all colpartitions in table regions to make them a single
1995 // colpartition and revert types of isolated table cells not
1996 // assigned to any table to their original types.
1998  ColPartitionSet** all_columns,
1999  WidthCallback width_cb) {
2000  // Since we have table blocks already, remove table tags from all
2001  // colpartitions
2003  gsearch(grid);
2004  gsearch.StartFullSearch();
2005  ColPartition* part = nullptr;
2006 
2007  while ((part = gsearch.NextFullSearch()) != nullptr) {
2008  if (part->type() == PT_TABLE) {
2009  part->clear_table_type();
2010  }
2011  }
2012  // Now make a single colpartition out of each table block and remove
2013  // all colpartitions contained within a table
2015  table_search(&table_grid_);
2016  table_search.StartFullSearch();
2017  ColSegment* table;
2018  while ((table = table_search.NextFullSearch()) != nullptr) {
2019  const TBOX& table_box = table->bounding_box();
2020  // Start a rect search on table_box
2022  rectsearch(grid);
2023  rectsearch.StartRectSearch(table_box);
2024  ColPartition* part;
2025  ColPartition* table_partition = nullptr;
2026  while ((part = rectsearch.NextRectSearch()) != nullptr) {
2027  // Do not consider image partitions
2028  if (!part->IsTextType())
2029  continue;
2030  TBOX part_box = part->bounding_box();
2031  // Include partition in the table if more than half of it
2032  // is covered by the table
2033  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2034  rectsearch.RemoveBBox();
2035  if (table_partition) {
2036  table_partition->Absorb(part, width_cb);
2037  } else {
2038  table_partition = part;
2039  }
2040  }
2041  }
2042  // Insert table colpartition back to part_grid_
2043  if (table_partition) {
2044  // To match the columns used when transforming to blocks, the new table
2045  // partition must have its first and last column set at the grid y that
2046  // corresponds to its bottom.
2047  const TBOX& table_box = table_partition->bounding_box();
2048  int grid_x, grid_y;
2049  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2050  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2051  table_partition->set_table_type();
2052  table_partition->set_blob_type(BRT_TEXT);
2053  table_partition->set_flow(BTFT_CHAIN);
2054  table_partition->SetBlobTypes();
2055  grid->InsertBBox(true, true, table_partition);
2056  }
2057  }
2058 }
2059 
2063  : ELIST_LINK(),
2064  num_table_cells_(0),
2065  num_text_cells_(0),
2066  type_(COL_UNKNOWN) {
2067 }
2068 
2069 // Provides a color for BBGrid to draw the rectangle.
2071  const ScrollView::Color kBoxColors[PT_COUNT] = {
2076  };
2077  return kBoxColors[type_];
2078 }
2079 
2080 // Insert a box into this column segment
2081 void ColSegment::InsertBox(const TBOX& other) {
2082  bounding_box_ = bounding_box_.bounding_union(other);
2083 }
2084 
2085 // Set column segment type based on the ratio of text and table partitions
2086 // in it.
2088  if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
2089  type_ = COL_TABLE;
2090  else if (num_text_cells_ > num_table_cells_)
2091  type_ = COL_TEXT;
2092  else
2093  type_ = COL_MIXED;
2094 }
2095 
2096 } // namespace tesseract.
tesseract::kMaxBlobWidth
const int kMaxBlobWidth
Definition: tablefind.cpp:39
tesseract::TableFinder::InsertImagePartition
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:421
ScrollView
Definition: scrollview.h:97
tesseract::TableFinder::FilterHeaderAndFooter
void FilterHeaderAndFooter()
Definition: tablefind.cpp:1074
ScrollView::Brush
void Brush(Color color)
Definition: scrollview.cpp:723
tesseract::TableFinder::GroupColumnBlocks
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:538
tesseract::kLargeTableRowCount
const int kLargeTableRowCount
Definition: tablefind.cpp:108
tesseract::TableFinder::set_global_median_xheight
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:756
tesseract::kStrokeWidthFractionalTolerance
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:139
tesseract::GridSearch::StartRectSearch
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:830
tesseract::TableFinder::InsertFragmentedTextPartition
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:402
tesseract::ColPartition::inside_table_column
bool inside_table_column()
Definition: colpartition.h:243
tesseract::TableFinder::DisplayColSegmentGrid
void DisplayColSegmentGrid(ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
Definition: tablefind.cpp:1891
tesseract::GridSearch::StartSideSearch
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:746
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
tesseract::TableFinder::left_to_right_language_
bool left_to_right_language_
Definition: tablefind.h:424
tesseract::GridSearch::RepositionIterator
void RepositionIterator()
Definition: bbgrid.h:892
tesseract::ColPartition::IsLineType
bool IsLineType() const
Definition: colpartition.h:425
tesseract::TableFinder::table_grid_
ColSegmentGrid table_grid_
Definition: tablefind.h:422
tesseract::TableFinder::GridMergeColumnBlocks
void GridMergeColumnBlocks()
Definition: tablefind.cpp:1195
tesseract::BBGrid::InsertBBox
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
tesseract::COL_TEXT
Definition: tablefind.h:31
tesseract::BBGrid::MakeWindow
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
BRT_NOISE
Definition: blobbox.h:72
tesseract::TableRecognizer
Definition: tablerecog.h:257
tesseract::GridSearch::StartVerticalSearch
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:788
tesseract::kMaxXProjectionGapFactor
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:135
tesseract::kMinRowsInTable
const int kMinRowsInTable
Definition: tablefind.cpp:111
tesseract::kTableColumnThreshold
const double kTableColumnThreshold
Definition: tablefind.cpp:88
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::ColPartition::median_width
int median_width() const
Definition: colpartition.h:142
tesseract::kMaxVerticalSpacing
const int kMaxVerticalSpacing
Definition: tablefind.cpp:37
tesseract::ColPartition::space_below
int space_below() const
Definition: colpartition.h:267
tesseract::ColPartition::flow
BlobTextFlowType flow() const
Definition: colpartition.h:154
tesseract::ColPartition::set_space_above
void set_space_above(int space)
Definition: colpartition.h:264
tesseract::ColSegment
Definition: tablefind.h:45
tesseract::TableFinder::set_global_median_ledding
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:762
tesseract::GridSearch::NextRectSearch
BBC * NextRectSearch()
Definition: bbgrid.h:842
tesseract::ColSegment::set_num_text_cells
void set_num_text_cells(int n)
Definition: tablefind.h:89
tesseract::ColPartition::MatchingSizes
bool MatchingSizes(const ColPartition &other) const
Definition: colpartition.cpp:405
tesseract::kLargeTableProjectionThreshold
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:106
tesseract::TableRecognizer::set_min_height
void set_min_height(int height)
Definition: tablerecog.cpp:725
tesseract::TableFinder::AllowBlob
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:502
TBOX::overlap
bool overlap(const TBOX &box) const
Definition: rect.h:350
ICOORD
integer coordinate
Definition: points.h:30
tesseract::WidthCallback
std::function< bool(int)> WidthCallback
Definition: tabfind.h:35
tesseract::TableFinder::DisplayColPartitionConnections
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1950
tesseract::GridBase::gridwidth
int gridwidth() const
Definition: bbgrid.h:66
PT_HEADING_IMAGE
Definition: capi.h:118
tesseract::GridSearch::StartFullSearch
void StartFullSearch()
Definition: bbgrid.h:665
tesseract::ColPartition::median_height
int median_height() const
Definition: colpartition.h:136
tesseract::ColPartition::set_space_to_left
void set_space_to_left(int space)
Definition: colpartition.h:276
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
tesseract::kMinBoxesInTextPartition
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:62
tesseract::ColPartition::type
PolyBlockType type() const
Definition: colpartition.h:181
tesseract::BBGrid::Clear
void Clear()
Definition: bbgrid.h:455
TBOX::bounding_union
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:124
tesseract::TableFinder::RecognizeTables
void RecognizeTables()
Definition: tablefind.cpp:1817
TBOX::area
int32_t area() const
Definition: rect.h:121
tesseract::TableFinder::~TableFinder
~TableFinder()
Definition: tablefind.cpp:167
TO_BLOCK
Definition: blobbox.h:691
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
PT_FLOWING_IMAGE
Definition: capi.h:117
colpartitionset.h
tesseract::TableFinder::set_global_median_blob_width
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:759
tesseract::TableFinder::DisplayColSegments
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1871
tesseract::TableFinder::HasLeaderAdjacent
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:946
tesseract::TableFinder::AdjustTableBoundaries
void AdjustTableBoundaries()
Definition: tablefind.cpp:1486
ScrollView::NONE
Definition: scrollview.h:101
tesseract::TableRecognizer::set_max_text_height
void set_max_text_height(int height)
Definition: tablerecog.cpp:731
ScrollView::Pen
void Pen(Color color)
Definition: scrollview.cpp:717
tesseract::ColSegment::set_bounding_box
void set_bounding_box(const TBOX &other)
Definition: tablefind.h:71
tesseract::ColPartitionSet
Definition: colpartitionset.h:39
tesseract::TableFinder::ConsecutiveBoxes
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:568
tesseract::ColPartition::median_bottom
int median_bottom() const
Definition: colpartition.h:127
tesseract::kMaxBoxesInDataPartition
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:65
TBOX::major_y_overlap
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:428
PT_TABLE
Definition: capi.h:114
tesseract::TableFinder::InsertLeaderPartition
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:410
tesseract::kMinParagraphEndingTextToWhitespaceRatio
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:131
tesseract::kMaxGapInTextPartition
const double kMaxGapInTextPartition
Definition: tablefind.cpp:68
tesseract::TableFinder::SplitAndInsertFragmentedTextPartition
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:436
tesseract::ColPartition::boxes
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
ICOORD::x
int16_t x() const
access function
Definition: points.h:51
FCOORD
Definition: points.h:187
BLOBNBOX
Definition: blobbox.h:142
BTFT_CHAIN
Definition: blobbox.h:117
tesseract::TableFinder::bleft
const ICOORD & bleft() const
Definition: tablefind.cpp:387
ScrollView::BLUE
Definition: scrollview.h:108
BTFT_LEADER
Definition: blobbox.h:120
PT_COUNT
Definition: capi.h:123
tesseract::TableFinder::IncludeLeftOutColumnHeaders
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1664
tesseract::kMaxBlobOverlapFactor
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:76
tesseract::ColPartition
Definition: colpartition.h:67
tesseract::GridBase::tright
const ICOORD & tright() const
Definition: bbgrid.h:75
TBOX::height
int16_t height() const
Definition: rect.h:107
tesseract::TableFinder::gridwidth
int gridwidth() const
Definition: tablefind.cpp:381
tesseract::TableFinder::AllowTextPartition
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:489
tesseract::ColPartitionSet::GetColumnBoxes
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
Definition: colpartitionset.cpp:369
tesseract::TableRecognizer::set_line_grid
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:722
tesseract::ColPartition::SplitAt
ColPartition * SplitAt(int split_x)
Definition: colpartition.cpp:823
tesseract::TableFinder::resolution_
int resolution_
Definition: tablefind.h:402
tesseract::ColPartition::boxes_count
int boxes_count() const
Definition: colpartition.h:190
tesseract::ColPartition::set_blob_type
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:151
TBOX::set_right
void set_right(int x)
Definition: rect.h:81
tesseract::kAllowBlobArea
const double kAllowBlobArea
Definition: tablefind.cpp:57
tesseract::ColPartition::SingletonPartner
ColPartition * SingletonPartner(bool upper)
Definition: colpartition.cpp:629
ScrollView::DARK_TURQUOISE
Definition: scrollview.h:146
tesseract::TableFinder::tright
const ICOORD & tright() const
Definition: tablefind.cpp:390
tesseract::COL_TABLE
Definition: tablefind.h:32
tesseract::GridSearch::GridY
int GridY() const
Definition: bbgrid.h:245
tesseract::ColSegment::InsertBox
void InsertBox(const TBOX &other)
Definition: tablefind.cpp:2081
tesseract::TableFinder::FilterParagraphEndings
void FilterParagraphEndings()
Definition: tablefind.cpp:994
tesseract::ColPartition::set_space_to_right
void set_space_to_right(int space)
Definition: colpartition.h:282
tesseract::GridSearch::NextSideSearch
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:761
tesseract::ColPartitionGrid::RefinePartitionPartners
void RefinePartitionPartners(bool get_desperate)
Definition: colpartitiongrid.cpp:1287
tesseract::TableFinder::gridheight
int gridheight() const
Definition: tablefind.cpp:384
tesseract::TableFinder::leader_and_ruling_grid_
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:414
tesseract::kMinMaxGapInTextPartition
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:72
tesseract::TableFinder::GrowTableToIncludePartials
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1542
ScrollView::KHAKI
Definition: scrollview.h:134
ScrollView::ORANGE
Definition: scrollview.h:136
TBOX::major_x_overlap
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:403
tesseract::TableFinder::InsertRulingPartition
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:418
tesseract::kSideSpaceMargin
const int kSideSpaceMargin
Definition: tablefind.cpp:101
tesseract::TableFinder::MarkPartitionsUsingLocalInformation
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:827
tesseract::TableFinder::clean_part_grid_
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:412
tesseract::ColPartition::nearest_neighbor_below
ColPartition * nearest_neighbor_below() const
Definition: colpartition.h:255
ScrollView::UpdateWindow
void UpdateWindow()
Definition: scrollview.cpp:703
tesseract::ColPartition::IsImageType
bool IsImageType() const
Definition: colpartition.h:429
tesseract::kAllowTextHeight
const double kAllowTextHeight
Definition: tablefind.cpp:48
TBOX::null_box
bool null_box() const
Definition: rect.h:49
ScrollView::LIGHT_BLUE
Definition: scrollview.h:112
ScrollView::MAGENTA
Definition: scrollview.h:109
BRT_TEXT
Definition: blobbox.h:79
tesseract::TableFinder::TableFinder
TableFinder()
Definition: tablefind.cpp:159
tesseract::TableFinder::GetTableColumns
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1273
tesseract::TableFinder::DeleteSingleColumnTables
void DeleteSingleColumnTables()
Definition: tablefind.cpp:1703
tesseract::TableFinder::HasWideOrNoInterWordGap
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:857
tesseract::ColPartition::blob_type
BlobRegionType blob_type() const
Definition: colpartition.h:148
tesseract::ColPartition::MidY
int MidY() const
Definition: colpartition.h:304
tesseract::kMaxColumnHeaderDistance
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:84
TBOX::width
int16_t width() const
Definition: rect.h:114
tesseract::kMaxParagraphEndingLeftSpaceMultiple
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:125
STATS::plot
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:558
BOOL_VAR
#define BOOL_VAR(name, val, comment)
Definition: params.h:303
tesseract::kMinOverlapWithTable
const double kMinOverlapWithTable
Definition: tablefind.cpp:96
ScrollView::YELLOW
Definition: scrollview.h:105
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::TableFinder::GridMergeTableRegions
void GridMergeTableRegions()
Definition: tablefind.cpp:1386
tesseract::TableFinder::GrowTableBox
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1520
tesseract::ColSegment::set_num_table_cells
void set_num_table_cells(int n)
Definition: tablefind.h:80
tesseract::TableFinder::GrowTableToIncludeLines
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1570
tesseract::BBGrid< ColSegment, ColSegment_CLIST, ColSegment_C_IT >
tesseract::TableFinder::col_seg_grid_
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:420
TBOX::overlap_fraction
double overlap_fraction(const TBOX &box) const
Definition: rect.h:381
ScrollView::AQUAMARINE
Definition: scrollview.h:110
tesseract::TableFinder::FilterFalseAlarms
void FilterFalseAlarms()
Definition: tablefind.cpp:988
tesseract::TableRecognizer::RecognizeTable
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:735
CLISTIZE
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
tesseract::GridSearch
Definition: bbgrid.h:48
tesseract::COL_MIXED
Definition: tablefind.h:33
tesseract
Definition: baseapi.h:65
STATS::median
double median() const
Definition: statistc.cpp:218
tesseract::ColSegment::set_type
void set_type()
Definition: tablefind.cpp:2087
tesseract::ColPartition::set_inside_table_column
void set_inside_table_column(bool val)
Definition: colpartition.h:246
tesseract::ColPartition::space_to_right
int space_to_right() const
Definition: colpartition.h:279
ScrollView::RED
Definition: scrollview.h:104
tesseract::TableFinder::InsertCleanPartitions
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
Definition: tablefind.cpp:193
PT_UNKNOWN
Definition: capi.h:108
STATS
Definition: statistc.h:30
tesseract::TableRecognizer::set_text_grid
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:719
tesseract::DeleteObject
void DeleteObject(T *object)
Definition: tablefind.cpp:155
BLOBNBOX::bounding_box
const TBOX & bounding_box() const
Definition: blobbox.h:229
tesseract::GridSearch::NextVerticalSearch
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:802
tesseract::TableFinder::HLineBelongsToTable
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1600
tesseract::ColPartition::set_flow
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
tesseract::TableFinder::SetVerticalSpacing
void SetVerticalSpacing(ColPartition *part)
Definition: tablefind.cpp:666
tesseract::COL_UNKNOWN
Definition: tablefind.h:30
tesseract::TableFinder::set_left_to_right_language
void set_left_to_right_language(bool order)
Definition: tablefind.cpp:177
tesseract::TableFinder::SetColumnsType
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1143
tesseract::ColPartition::IsInSameColumnAs
bool IsInSameColumnAs(const ColPartition &part) const
Definition: colpartition.cpp:2175
tesseract::TableFinder::global_median_blob_width_
int global_median_blob_width_
Definition: tablefind.h:406
tesseract::ColPartition::SetBlobTypes
void SetBlobTypes()
Definition: colpartition.cpp:1265
tesseract::kSplitPartitionSize
const double kSplitPartitionSize
Definition: tablefind.cpp:43
tesseract::ColSegment::type
ColSegType type() const
Definition: tablefind.h:93
PT_PULLOUT_IMAGE
Definition: capi.h:119
tesseract::GridBase::gridsize
int gridsize() const
Definition: bbgrid.h:63
tesseract::ColPartition::IsEmpty
bool IsEmpty() const
Definition: colpartition.h:357
tesseract::StructuredTable::Display
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:289
tesseract::BBGrid::ClearGridData
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:464
ScrollView::LIME_GREEN
Definition: scrollview.h:127
tesseract::ColPartition::bounding_box
const TBOX & bounding_box() const
Definition: colpartition.h:109
tesseract::TableFinder::MarkTablePartitions
void MarkTablePartitions()
Definition: tablefind.cpp:789
tesseract::ColPartition::space_to_left
int space_to_left() const
Definition: colpartition.h:273
tesseract::BBGrid::DisplayBoxes
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:613
tesseract::ColPartitionGrid::FindPartitionPartners
void FindPartitionPartners()
Definition: colpartitiongrid.cpp:1179
tesseract::ColPartition::AddBox
void AddBox(BLOBNBOX *box)
Definition: colpartition.cpp:169
tesseract::StructuredTable
Definition: tablerecog.h:72
tesseract::BBGrid::Init
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:445
tesseract::TableFinder::SmoothTablePartitionRuns
void SmoothTablePartitionRuns()
Definition: tablefind.cpp:1108
tesseract::ColPartition::set_nearest_neighbor_above
void set_nearest_neighbor_above(ColPartition *part)
Definition: colpartition.h:252
tesseract::TableFinder::Init
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
Definition: tablefind.cpp:181
tesseract::TableFinder::DisplayColPartitions
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1916
tesseract::ColPartition::set_table_type
void set_table_type()
Definition: colpartition.h:233
BLOBNBOX::flow
BlobTextFlowType flow() const
Definition: blobbox.h:294
tesseract::ColPartition::RightAtY
int RightAtY(int y) const
Definition: colpartition.h:344
tesseract::ColPartition::ComputeLimits
void ComputeLimits()
Definition: colpartition.cpp:861
tesseract::ColPartition::SetPartitionType
void SetPartitionType(int resolution, ColPartitionSet *columns)
Definition: colpartition.cpp:973
tesseract::TableFinder::SetPartitionSpacings
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:586
tesseract::kSmallTableProjectionThreshold
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:105
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::ColPartitionGrid
Definition: colpartitiongrid.h:32
STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:87
ScrollView::GREEN
Definition: scrollview.h:106
tesseract::TableFinder::SetGlobalSpacings
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:709
tesseract::ColSegment::bounding_box
const TBOX & bounding_box() const
Definition: tablefind.h:51
tesseract::TableFinder::global_median_ledding_
int global_median_ledding_
Definition: tablefind.h:408
tesseract::kAllowBlobWidth
const double kAllowBlobWidth
Definition: tablefind.cpp:56
PT_FLOWING_TEXT
Definition: capi.h:109
tesseract::kMaxTableCellXheight
const double kMaxTableCellXheight
Definition: tablefind.cpp:80
tesseract::TableFinder::global_median_xheight_
int global_median_xheight_
Definition: tablefind.h:404
tesseract::ColSegment::BoxColor
ScrollView::Color BoxColor() const
Definition: tablefind.cpp:2070
tesseract::kAdjacentLeaderSearchPadding
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:116
BLOBNBOX::region_type
BlobRegionType region_type() const
Definition: blobbox.h:282
tesseract::ColPartition::nearest_neighbor_above
ColPartition * nearest_neighbor_above() const
Definition: colpartition.h:249
tesseract::ColPartition::clear_table_type
void clear_table_type()
Definition: colpartition.h:239
TBOX::right
int16_t right() const
Definition: rect.h:78
tesseract::kAllowBlobHeight
const double kAllowBlobHeight
Definition: tablefind.cpp:55
tesseract::TableFinder::fragmented_text_grid_
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:418
tesseract::ColPartition::Absorb
void Absorb(ColPartition *other, WidthCallback cb)
Definition: colpartition.cpp:638
tesseract::TableFinder::InsertTextPartition
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:394
ScrollView::Line
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:531
tesseract::TableFinder::FindNeighbors
void FindNeighbors()
Definition: tablefind.cpp:766
tesseract::kStrokeWidthConstantTolerance
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:140
tesseract::kAllowTextArea
const double kAllowTextArea
Definition: tablefind.cpp:50
tesseract::TableRecognizer::Init
void Init()
Definition: tablerecog.cpp:716
tesseract::GridSearch::SetUniqueMode
void SetUniqueMode(bool mode)
Definition: bbgrid.h:253
ELIST_LINK
Definition: elst.h:74
tesseract::TableFinder::MoveColSegmentsToGrid
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1176
tesseract::ColPartition::VSignificantCoreOverlap
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:390
tesseract::ColSegment::ColSegment
ColSegment()
Definition: tablefind.cpp:2062
tesseract::TableFinder::MakeWindow
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
tablerecog.h
tesseract::TableFinder::BelongToOneTable
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1444
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::ColPartition::MatchingStrokeWidth
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
Definition: colpartition.cpp:430
tesseract::TableFinder::GapInXProjection
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1768
ScrollView::Color
Color
Definition: scrollview.h:100
tesseract::TableFinder::MakeTableBlocks
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb)
Definition: tablefind.cpp:1997
tesseract::TableFinder::LocateTables
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
Definition: tablefind.cpp:259
tesseract::TableFinder::gridsize
int gridsize() const
Definition: tablefind.cpp:378
ScrollView::Rectangle
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:599
tesseract::TableFinder::InitializePartitions
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:579
tesseract::ColPartition::CopyButDontOwnBlobs
ColPartition * CopyButDontOwnBlobs()
Definition: colpartition.cpp:1758
tesseract::TableFinder::GetTableRegions
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1323
tesseract::ColPartition::LeftAtY
int LeftAtY(int y) const
Definition: colpartition.h:340
tesseract::ColPartition::IsTextType
bool IsTextType() const
Definition: colpartition.h:433
tesseract::ColPartitionSet::ColumnContaining
ColPartition * ColumnContaining(int x, int y)
Definition: colpartitionset.cpp:69
tesseract::ColPartition::set_nearest_neighbor_below
void set_nearest_neighbor_below(ColPartition *part)
Definition: colpartition.h:258
tesseract::TableFinder::GetColumnBlocks
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:523
tesseract::GridBase::gridheight
int gridheight() const
Definition: bbgrid.h:69
tesseract::GridSearch::RemoveBBox
void RemoveBBox()
Definition: bbgrid.h:866
tesseract::kAllowTextWidth
const double kAllowTextWidth
Definition: tablefind.cpp:49
tesseract::GridBase::bleft
const ICOORD & bleft() const
Definition: bbgrid.h:72
tablefind.h
tesseract::StructuredTable::bounding_box
const TBOX & bounding_box() const
Definition: tablerecog.cpp:109
tesseract::ColPartition::ShallowCopy
ColPartition * ShallowCopy() const
Definition: colpartition.cpp:1731
ELISTIZE
#define ELISTIZE(CLASSNAME)
Definition: elst.h:919
tesseract::GridBase::GridCoords
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:52
TBOX::set_left
void set_left(int x)
Definition: rect.h:74
tesseract::ColPartition::IsHorizontalLine
bool IsHorizontalLine() const
Definition: colpartition.h:459
tesseract::ColPartition::set_space_below
void set_space_below(int space)
Definition: colpartition.h:270
ICOORD::y
int16_t y() const
access_function
Definition: points.h:55
tesseract::kParagraphEndingPreviousLineRatio
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:121
tesseract::GridSearch::NextFullSearch
BBC * NextFullSearch()
Definition: bbgrid.h:675
TBOX
Definition: rect.h:33
tesseract::ColPartition::space_above
int space_above() const
Definition: colpartition.h:261