tesseract  4.0.0-1-g2a2b
gap_map.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #include "statistc.h"
12 #include "gap_map.h"
13 
14 BOOL_VAR(gapmap_debug, FALSE, "Say which blocks have tables");
15 BOOL_VAR(gapmap_use_ends, FALSE, "Use large space at start and end of rows");
17 "Ensure gaps not less than 2quanta wide");
18 double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
19 
20 /*************************************************************************
21  * A block gap map is a quantised histogram of whitespace regions in the
22  * block. It is a vertical projection of wide gaps WITHIN lines
23  *
24  * The map is held as an array of counts of rows which have a wide gap
25  * covering that region of the row. Each bucket in the map represents a width
26  * of about half an xheight - (The median of the xhts in the rows is used.)
27  *
28  * The block is considered RECTANGULAR - delimited by the left and right
29  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
30  * counted.
31  *
32  *************************************************************************/
33 
34 GAPMAP::GAPMAP( //Constructor
35  TO_BLOCK *block //block
36  ) {
37  TO_ROW *row; //current row
38  BLOBNBOX_IT blob_it; //iterator
39  TBOX blob_box;
40  TBOX prev_blob_box;
41  int16_t gap_width;
42  int16_t start_of_row;
43  int16_t end_of_row;
44  STATS xht_stats (0, 128);
45  int16_t min_quantum;
46  int16_t max_quantum;
47  int16_t i;
48 
49  /*
50  Find left and right extremes and bucket size
51  */
52  map = nullptr;
53  min_left = INT16_MAX;
54  max_right = -INT16_MAX;
55  total_rows = 0;
56  any_tabs = false;
57 
58  // row iterator
59  TO_ROW_IT row_it(block->get_rows());
60  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
61  row = row_it.data ();
62  if (!row->blob_list ()->empty ()) {
63  total_rows++;
64  xht_stats.add ((int16_t) floor (row->xheight + 0.5), 1);
65  blob_it.set_to_list (row->blob_list ());
66  start_of_row = blob_it.data ()->bounding_box ().left ();
67  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
68  if (min_left > start_of_row)
69  min_left = start_of_row;
70  if (max_right < end_of_row)
71  max_right = end_of_row;
72  }
73  }
74  if ((total_rows < 3) || (min_left >= max_right)) {
75  bucket_size = 0;
76  map_max = 0;
77  total_rows = 0;
78  min_left = max_right = 0;
79  return;
80  }
81  bucket_size = (int16_t) floor (xht_stats.median () + 0.5) / 2;
82  map_max = (max_right - min_left) / bucket_size;
83  map = new int16_t[map_max + 1];
84  for (i = 0; i <= map_max; i++)
85  map[i] = 0;
86 
87  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
88  row = row_it.data ();
89  if (!row->blob_list ()->empty ()) {
90  blob_it.set_to_list (row->blob_list ());
91  blob_it.mark_cycle_pt ();
92  blob_box = box_next (&blob_it);
93  prev_blob_box = blob_box;
94  if (gapmap_use_ends) {
95  /* Leading space */
96  gap_width = blob_box.left () - min_left;
97  if ((gap_width > gapmap_big_gaps * row->xheight)
98  && gap_width > 2) {
99  max_quantum = (blob_box.left () - min_left) / bucket_size;
100  if (max_quantum > map_max) max_quantum = map_max;
101  for (i = 0; i <= max_quantum; i++)
102  map[i]++;
103  }
104  }
105  while (!blob_it.cycled_list ()) {
106  blob_box = box_next (&blob_it);
107  gap_width = blob_box.left () - prev_blob_box.right ();
108  if ((gap_width > gapmap_big_gaps * row->xheight)
109  && gap_width > 2) {
110  min_quantum =
111  (prev_blob_box.right () - min_left) / bucket_size;
112  max_quantum = (blob_box.left () - min_left) / bucket_size;
113  if (max_quantum > map_max) max_quantum = map_max;
114  for (i = min_quantum; i <= max_quantum; i++)
115  map[i]++;
116  }
117  prev_blob_box = blob_box;
118  }
119  if (gapmap_use_ends) {
120  /* Trailing space */
121  gap_width = max_right - prev_blob_box.right ();
122  if ((gap_width > gapmap_big_gaps * row->xheight)
123  && gap_width > 2) {
124  min_quantum =
125  (prev_blob_box.right () - min_left) / bucket_size;
126  if (min_quantum < 0) min_quantum = 0;
127  for (i = min_quantum; i <= map_max; i++)
128  map[i]++;
129  }
130  }
131  }
132  }
133  for (i = 0; i <= map_max; i++) {
134  if (map[i] > total_rows / 2) {
136  (((i == 0) &&
137  (map[i + 1] <= total_rows / 2)) ||
138  ((i == map_max) &&
139  (map[i - 1] <= total_rows / 2)) ||
140  ((i > 0) &&
141  (i < map_max) &&
142  (map[i - 1] <= total_rows / 2) &&
143  (map[i + 1] <= total_rows / 2)))) {
144  map[i] = 0; //prevent isolated quantum
145  }
146  else
147  any_tabs = true;
148  }
149  }
150  if (gapmap_debug && any_tabs)
151  tprintf ("Table found\n");
152 }
153 
154 
155 /*************************************************************************
156  * GAPMAP::table_gap()
157  * Is there a bucket in the specified range where more than half the rows in the
158  * block have a wide gap?
159  *************************************************************************/
160 
161 bool GAPMAP::table_gap( //Is gap a table?
162  int16_t left, //From here
163  int16_t right //To here
164 ) {
165  int16_t min_quantum;
166  int16_t max_quantum;
167  int16_t i;
168  bool tab_found = false;
169 
170  if (!any_tabs)
171  return false;
172 
173  min_quantum = (left - min_left) / bucket_size;
174  max_quantum = (right - min_left) / bucket_size;
175  // Clip to the bounds of the array. In some circumstances (big blob followed
176  // by small blob) max_quantum can exceed the map_max bounds, but we clip
177  // here instead, as it provides better long-term safety.
178  if (min_quantum < 0) min_quantum = 0;
179  if (max_quantum > map_max) max_quantum = map_max;
180  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
181  if (map[i] > total_rows / 2)
182  tab_found = true;
183  return tab_found;
184 }
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:161
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:17
#define double_VAR(name, val, comment)
Definition: params.h:285
double gapmap_big_gaps
Definition: gap_map.cpp:18
TBOX
Definition: rect.h:34
STATS
Definition: statistc.h:33
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:637
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
bool gapmap_use_ends
Definition: gap_map.cpp:15
float xheight
Definition: blobbox.h:670
int16_t left() const
Definition: rect.h:72
double median() const
Definition: statistc.cpp:238
#define FALSE
Definition: capi.h:52
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
bool gapmap_debug
Definition: gap_map.cpp:14
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:34
int16_t right() const
Definition: rect.h:79
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612