tesseract  4.0.0-1-g2a2b
tabvector.cpp
Go to the documentation of this file.
1 // File: tabvector.cpp
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author: Ray Smith
5 // Created: Thu Apr 10 16:28:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "tabvector.h"
25 #include "blobbox.h"
26 #include "colfind.h"
27 #include "colpartitionset.h"
28 #include "detlinefit.h"
29 #include "statistc.h"
30 
31 #include <algorithm>
32 
33 namespace tesseract {
34 
35 // Multiple of height used as a gutter for evaluation search.
36 const int kGutterMultiple = 4;
37 // Multiple of neighbour gap that we expect the gutter gap to be at minimum.
39 // Pixel distance for tab vectors to be considered the same.
40 const int kSimilarVectorDist = 10;
41 // Pixel distance for ragged tab vectors to be considered the same if there
42 // is nothing in the overlap box
43 const int kSimilarRaggedDist = 50;
44 // Max multiple of height to allow filling in between blobs when evaluating.
45 const int kMaxFillinMultiple = 11;
46 // Min fraction of mean gutter size to allow a gutter on a good tab blob.
47 const double kMinGutterFraction = 0.5;
48 // Multiple of 1/n lines as a minimum gutter in evaluation.
49 const double kLineCountReciprocal = 4.0;
50 // Constant add-on for minimum gutter for aligned tabs.
51 const double kMinAlignedGutter = 0.25;
52 // Constant add-on for minimum gutter for ragged tabs.
53 const double kMinRaggedGutter = 1.5;
54 
56  "max fraction of mean blob width allowed for vertical gaps in vertical text");
57 
59  "Fraction of box matches required to declare a line vertical");
60 
62 
63 // Create a constraint for the top or bottom of this TabVector.
64 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
65  TabConstraint* constraint = new TabConstraint(vector, is_top);
66  TabConstraint_LIST* constraints = new TabConstraint_LIST;
67  TabConstraint_IT it(constraints);
68  it.add_to_end(constraint);
69  if (is_top)
70  vector->set_top_constraints(constraints);
71  else
72  vector->set_bottom_constraints(constraints);
73 }
74 
75 // Test to see if the constraints are compatible enough to merge.
76 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
77  TabConstraint_LIST* list2) {
78  if (list1 == list2)
79  return false;
80  int y_min = -INT32_MAX;
81  int y_max = INT32_MAX;
82  if (textord_debug_tabfind > 3)
83  tprintf("Testing constraint compatibility\n");
84  GetConstraints(list1, &y_min, &y_max);
85  GetConstraints(list2, &y_min, &y_max);
86  if (textord_debug_tabfind > 3)
87  tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
88  return y_max >= y_min;
89 }
90 
91 // Merge the lists of constraints and update the TabVector pointers.
92 // The second list is deleted.
93 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
94  TabConstraint_LIST* list2) {
95  if (list1 == list2)
96  return;
97  TabConstraint_IT it(list2);
98  if (textord_debug_tabfind > 3)
99  tprintf("Merging constraints\n");
100  // The vectors of all constraints on list2 are now going to be on list1.
101  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
102  TabConstraint* constraint = it.data();
103  if (textord_debug_tabfind> 3)
104  constraint->vector_->Print("Merge");
105  if (constraint->is_top_)
106  constraint->vector_->set_top_constraints(list1);
107  else
108  constraint->vector_->set_bottom_constraints(list1);
109  }
110  it = list1;
111  it.add_list_before(list2);
112  delete list2;
113 }
114 
115 // Set all the tops and bottoms as appropriate to a mean of the
116 // constrained range. Delete all the constraints and list.
117 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
118  int y_min = -INT32_MAX;
119  int y_max = INT32_MAX;
120  GetConstraints(constraints, &y_min, &y_max);
121  int y = (y_min + y_max) / 2;
122  TabConstraint_IT it(constraints);
123  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
124  TabConstraint* constraint = it.data();
125  TabVector* v = constraint->vector_;
126  if (constraint->is_top_) {
127  v->SetYEnd(y);
128  v->set_top_constraints(nullptr);
129  } else {
130  v->SetYStart(y);
131  v->set_bottom_constraints(nullptr);
132  }
133  }
134  delete constraints;
135 }
136 
137 TabConstraint::TabConstraint(TabVector* vector, bool is_top)
138  : vector_(vector), is_top_(is_top) {
139  if (is_top) {
140  y_min_ = vector->endpt().y();
141  y_max_ = vector->extended_ymax();
142  } else {
143  y_max_ = vector->startpt().y();
144  y_min_ = vector->extended_ymin();
145  }
146 }
147 
148 // Get the max of the mins and the min of the maxes.
149 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
150  int* y_min, int* y_max) {
151  TabConstraint_IT it(constraints);
152  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
153  TabConstraint* constraint = it.data();
154  if (textord_debug_tabfind > 3) {
155  tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
156  constraint->vector_->Print(" for");
157  }
158  *y_min = std::max(*y_min, constraint->y_min_);
159  *y_max = std::min(*y_max, constraint->y_max_);
160  }
161 }
162 
163 ELIST2IZE(TabVector)
164 CLISTIZE(TabVector)
165 
166 // The constructor is private. See the bottom of the file...
167 
168 
169 // Public factory to build a TabVector from a list of boxes.
170 // The TabVector will be of the given alignment type.
171 // The input vertical vector is used in fitting, and the output
172 // vertical_x, vertical_y have the resulting line vector added to them
173 // if the alignment is not ragged.
174 // The extended_start_y and extended_end_y are the maximum possible
175 // extension to the line segment that can be used to align with others.
176 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
177 TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical,
178  int extended_start_y, int extended_end_y,
179  BLOBNBOX_CLIST* good_points,
180  int* vertical_x, int* vertical_y) {
181  TabVector* vector = new TabVector(extended_start_y, extended_end_y,
182  alignment, good_points);
183  if (!vector->Fit(vertical, false)) {
184  delete vector;
185  return nullptr;
186  }
187  if (!vector->IsRagged()) {
188  vertical = vector->endpt_ - vector->startpt_;
189  int weight = vector->BoxCount();
190  *vertical_x += vertical.x() * weight;
191  *vertical_y += vertical.y() * weight;
192  }
193  return vector;
194 }
195 
196 // Build a ragged TabVector by copying another's direction, shifting it
197 // to match the given blob, and making its initial extent the height
198 // of the blob, but its extended bounds from the bounds of the original.
200  const ICOORD& vertical_skew, BLOBNBOX* blob)
201  : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
202  sort_key_(0), percent_score_(0), mean_width_(0),
203  needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
204  alignment_(alignment),
205  top_constraints_(nullptr), bottom_constraints_(nullptr) {
206  BLOBNBOX_C_IT it(&boxes_);
207  it.add_to_end(blob);
208  TBOX box = blob->bounding_box();
209  if (IsLeftTab()) {
210  startpt_ = box.botleft();
211  endpt_ = box.topleft();
212  } else {
213  startpt_ = box.botright();
214  endpt_ = box.topright();
215  }
216  sort_key_ = SortKey(vertical_skew,
217  (startpt_.x() + endpt_.x()) / 2,
218  (startpt_.y() + endpt_.y()) / 2);
219  if (textord_debug_tabfind > 3)
220  Print("Constructed a new tab vector:");
221 }
222 
223 // Copies basic attributes of a tab vector for simple operations.
224 // Copies things such startpt, endpt, range.
225 // Does not copy things such as partners, boxes, or constraints.
226 // This is useful if you only need vector information for processing, such
227 // as in the table detection code.
229  TabVector* copy = new TabVector();
230  copy->startpt_ = startpt_;
231  copy->endpt_ = endpt_;
232  copy->alignment_ = alignment_;
233  copy->extended_ymax_ = extended_ymax_;
234  copy->extended_ymin_ = extended_ymin_;
235  copy->intersects_other_lines_ = intersects_other_lines_;
236  return copy;
237 }
238 
239 // Extend this vector to include the supplied blob if it doesn't
240 // already have it.
242  TBOX new_box = new_blob->bounding_box();
243  BLOBNBOX_C_IT it(&boxes_);
244  if (!it.empty()) {
245  BLOBNBOX* blob = it.data();
246  TBOX box = blob->bounding_box();
247  while (!it.at_last() && box.top() <= new_box.top()) {
248  if (blob == new_blob)
249  return; // We have it already.
250  it.forward();
251  blob = it.data();
252  box = blob->bounding_box();
253  }
254  if (box.top() >= new_box.top()) {
255  it.add_before_stay_put(new_blob);
256  needs_refit_ = true;
257  return;
258  }
259  }
260  needs_refit_ = true;
261  it.add_after_stay_put(new_blob);
262 }
263 
264 // Set the ycoord of the start and move the xcoord to match.
265 void TabVector::SetYStart(int start_y) {
266  startpt_.set_x(XAtY(start_y));
267  startpt_.set_y(start_y);
268 }
269 // Set the ycoord of the end and move the xcoord to match.
270 void TabVector::SetYEnd(int end_y) {
271  endpt_.set_x(XAtY(end_y));
272  endpt_.set_y(end_y);
273 }
274 
275 // Rotate the ends by the given vector. Auto flip start and end if needed.
276 void TabVector::Rotate(const FCOORD& rotation) {
277  startpt_.rotate(rotation);
278  endpt_.rotate(rotation);
279  int dx = endpt_.x() - startpt_.x();
280  int dy = endpt_.y() - startpt_.y();
281  if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
282  // Need to flip start/end.
283  ICOORD tmp = startpt_;
284  startpt_ = endpt_;
285  endpt_ = tmp;
286  }
287 }
288 
289 // Setup the initial constraints, being the limits of
290 // the vector and the extended ends.
292  TabConstraint::CreateConstraint(this, false);
294 }
295 
296 // Setup the constraints between the partners of this TabVector.
298  // With the first and last partner, we want a common bottom and top,
299  // respectively, and for each change of partner, we want a common
300  // top of first with bottom of next.
301  TabVector_C_IT it(&partners_);
302  TabVector* prev_partner = nullptr;
303  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
304  TabVector* partner = it.data();
305  if (partner->top_constraints_ == nullptr ||
306  partner->bottom_constraints_ == nullptr) {
307  partner->Print("Impossible: has no constraints");
308  Print("This vector has it as a partner");
309  continue;
310  }
311  if (prev_partner == nullptr) {
312  // This is the first partner, so common bottom.
313  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
314  partner->bottom_constraints_))
315  TabConstraint::MergeConstraints(bottom_constraints_,
316  partner->bottom_constraints_);
317  } else {
318  // We need prev top to be common with partner bottom.
319  if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
320  partner->bottom_constraints_))
321  TabConstraint::MergeConstraints(prev_partner->top_constraints_,
322  partner->bottom_constraints_);
323  }
324  prev_partner = partner;
325  if (it.at_last()) {
326  // This is the last partner, so common top.
327  if (TabConstraint::CompatibleConstraints(top_constraints_,
328  partner->top_constraints_))
329  TabConstraint::MergeConstraints(top_constraints_,
330  partner->top_constraints_);
331  }
332  }
333 }
334 
335 // Setup the constraints between this and its partner.
337  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
338  partner->bottom_constraints_))
339  TabConstraint::MergeConstraints(bottom_constraints_,
340  partner->bottom_constraints_);
341  if (TabConstraint::CompatibleConstraints(top_constraints_,
342  partner->top_constraints_))
343  TabConstraint::MergeConstraints(top_constraints_,
344  partner->top_constraints_);
345 }
346 
347 // Use the constraints to modify the top and bottom.
349  if (top_constraints_ != nullptr)
350  TabConstraint::ApplyConstraints(top_constraints_);
351  if (bottom_constraints_ != nullptr)
352  TabConstraint::ApplyConstraints(bottom_constraints_);
353 }
354 
355 // Merge close tab vectors of the same side that overlap.
357  TabVector_LIST* vectors,
358  BlobGrid* grid) {
359  TabVector_IT it1(vectors);
360  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
361  TabVector* v1 = it1.data();
362  TabVector_IT it2(it1);
363  for (it2.forward(); !it2.at_first(); it2.forward()) {
364  TabVector* v2 = it2.data();
365  if (v2->SimilarTo(vertical, *v1, grid)) {
366  // Merge into the forward one, in case the combined vector now
367  // overlaps one in between.
368  if (textord_debug_tabfind) {
369  v2->Print("Merging");
370  v1->Print("by deleting");
371  }
372  v2->MergeWith(vertical, it1.extract());
373  if (textord_debug_tabfind) {
374  v2->Print("Producing");
375  }
376  ICOORD merged_vector = v2->endpt();
377  merged_vector -= v2->startpt();
378  if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
379  v2->Print("Garbage result of merge?");
380  }
381  break;
382  }
383  }
384  }
385 }
386 
387 // Return true if this vector is the same side, overlaps, and close
388 // enough to the other to be merged.
389 bool TabVector::SimilarTo(const ICOORD& vertical,
390  const TabVector& other, BlobGrid* grid) const {
391  if ((IsRightTab() && other.IsRightTab()) ||
392  (IsLeftTab() && other.IsLeftTab())) {
393  // If they don't overlap, at least in extensions, then there is no chance.
394  if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
395  return false;
396  // A fast approximation to the scale factor of the sort_key_.
397  int v_scale = abs(vertical.y());
398  if (v_scale == 0)
399  v_scale = 1;
400  // If they are close enough, then OK.
401  if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
402  sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
403  return true;
404  // Ragged tabs get a bigger threshold.
405  if (!IsRagged() || !other.IsRagged() ||
406  sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
407  sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
408  return false;
409  if (grid == nullptr) {
410  // There is nothing else to test!
411  return true;
412  }
413  // If there is nothing in the rectangle between the vector that is going to
414  // move, and the place it is moving to, then they can be merged.
415  // Setup a vertical search for any blob.
416  const TabVector* mover = (IsRightTab() &&
417  sort_key_ < other.sort_key_) ? this : &other;
418  int top_y = mover->endpt_.y();
419  int bottom_y = mover->startpt_.y();
420  int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));
421  int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));
422  int shift = abs(sort_key_ - other.sort_key_) / v_scale;
423  if (IsRightTab()) {
424  right += shift;
425  } else {
426  left -= shift;
427  }
428 
430  vsearch.StartVerticalSearch(left, right, top_y);
431  BLOBNBOX* blob;
432  while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {
433  const TBOX& box = blob->bounding_box();
434  if (box.top() > bottom_y)
435  return true; // Nothing found.
436  if (box.bottom() < top_y)
437  continue; // Doesn't overlap.
438  int left_at_box = XAtY(box.bottom());
439  int right_at_box = left_at_box;
440  if (IsRightTab())
441  right_at_box += shift;
442  else
443  left_at_box -= shift;
444  if (std::min(right_at_box, static_cast<int>(box.right())) > std::max(left_at_box, static_cast<int>(box.left())))
445  return false;
446  }
447  return true; // Nothing found.
448  }
449  return false;
450 }
451 
452 // Eat the other TabVector into this and delete it.
453 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
454  extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);
455  extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);
456  if (other->IsRagged()) {
457  alignment_ = other->alignment_;
458  }
459  // Merge sort the two lists of boxes.
460  BLOBNBOX_C_IT it1(&boxes_);
461  BLOBNBOX_C_IT it2(&other->boxes_);
462  while (!it2.empty()) {
463  BLOBNBOX* bbox2 = it2.extract();
464  it2.forward();
465  TBOX box2 = bbox2->bounding_box();
466  BLOBNBOX* bbox1 = it1.data();
467  TBOX box1 = bbox1->bounding_box();
468  while (box1.bottom() < box2.bottom() && !it1.at_last()) {
469  it1.forward();
470  bbox1 = it1.data();
471  box1 = bbox1->bounding_box();
472  }
473  if (box1.bottom() < box2.bottom()) {
474  it1.add_to_end(bbox2);
475  } else if (bbox1 != bbox2) {
476  it1.add_before_stay_put(bbox2);
477  }
478  }
479  Fit(vertical, true);
480  other->Delete(this);
481 }
482 
483 // Add a new element to the list of partner TabVectors.
484 // Partners must be added in order of increasing y coordinate of the text line
485 // that makes them partners.
486 // Groups of identical partners are merged into one.
488  if (IsSeparator() || partner->IsSeparator())
489  return;
490  TabVector_C_IT it(&partners_);
491  if (!it.empty()) {
492  it.move_to_last();
493  if (it.data() == partner)
494  return;
495  }
496  it.add_after_then_move(partner);
497 }
498 
499 // Return true if other is a partner of this.
500 bool TabVector::IsAPartner(const TabVector* other) {
501  TabVector_C_IT it(&partners_);
502  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
503  if (it.data() == other)
504  return true;
505  }
506  return false;
507 }
508 
509 // These names must be synced with the TabAlignment enum in tabvector.h.
510 const char* kAlignmentNames[] = {
511  "Left Aligned",
512  "Left Ragged",
513  "Center",
514  "Right Aligned",
515  "Right Ragged",
516  "Separator"
517 };
518 
519 // Print basic information about this tab vector.
520 void TabVector::Print(const char* prefix) {
521  tprintf(
522  "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
523  " partners=%d\n",
524  prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(),
525  endpt_.x(), endpt_.y(), mean_width_, percent_score_, sort_key_,
526  boxes_.length(), partners_.length());
527 }
528 
529 // Print basic information about this tab vector and every box in it.
530 void TabVector::Debug(const char* prefix) {
531  Print(prefix);
532  BLOBNBOX_C_IT it(&boxes_);
533  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
534  BLOBNBOX* bbox = it.data();
535  const TBOX& box = bbox->bounding_box();
536  tprintf("Box at (%d,%d)->(%d,%d)\n",
537  box.left(), box.bottom(), box.right(), box.top());
538  }
539 }
540 
541 // Draw this tabvector in place in the given window.
543 #ifndef GRAPHICS_DISABLED
545  tab_win->Pen(ScrollView::BLUE);
546  else if (alignment_ == TA_LEFT_ALIGNED)
547  tab_win->Pen(ScrollView::LIME_GREEN);
548  else if (alignment_ == TA_LEFT_RAGGED)
549  tab_win->Pen(ScrollView::DARK_GREEN);
550  else if (alignment_ == TA_RIGHT_ALIGNED)
551  tab_win->Pen(ScrollView::PINK);
552  else if (alignment_ == TA_RIGHT_RAGGED)
553  tab_win->Pen(ScrollView::CORAL);
554  else
555  tab_win->Pen(ScrollView::WHITE);
556  tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
557  tab_win->Pen(ScrollView::GREY);
558  tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
559  tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
560  char score_buf[64];
561  snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
562  tab_win->TextAttributes("Times", 50, false, false, false);
563  tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
564 #endif
565 }
566 
567 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
569  TabFind* finder) {
570  if (needs_refit_)
571  Fit(vertical, true);
572  if (needs_evaluation_)
573  Evaluate(vertical, finder);
574 }
575 
576 // Evaluate the vector in terms of coverage of its length by good-looking
577 // box edges. A good looking box is one where its nearest neighbour on the
578 // inside is nearer than half the distance its nearest neighbour on the
579 // outside of the putative column. Bad boxes are removed from the line.
580 // A second pass then further filters boxes by requiring that the gutter
581 // width be a minimum fraction of the mean gutter along the line.
582 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
583  bool debug = false;
584  needs_evaluation_ = false;
585  int length = endpt_.y() - startpt_.y();
586  if (length == 0 || boxes_.empty()) {
587  percent_score_ = 0;
588  Print("Zero length in evaluate");
589  return;
590  }
591  // Compute the mean box height.
592  BLOBNBOX_C_IT it(&boxes_);
593  int mean_height = 0;
594  int height_count = 0;
595  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
596  BLOBNBOX* bbox = it.data();
597  const TBOX& box = bbox->bounding_box();
598  int height = box.height();
599  mean_height += height;
600  ++height_count;
601  }
602  if (height_count > 0) mean_height /= height_count;
603  int max_gutter = kGutterMultiple * mean_height;
604  if (IsRagged()) {
605  // Ragged edges face a tougher test in that the gap must always be within
606  // the height of the blob.
607  max_gutter = kGutterToNeighbourRatio * mean_height;
608  }
609 
610  STATS gutters(0, max_gutter + 1);
611  // Evaluate the boxes for their goodness, calculating the coverage as we go.
612  // Remove boxes that are not good and shorten the list to the first and
613  // last good boxes.
614  int num_deleted_boxes = 0;
615  bool text_on_image = false;
616  int good_length = 0;
617  const TBOX* prev_good_box = nullptr;
618  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
619  BLOBNBOX* bbox = it.data();
620  const TBOX& box = bbox->bounding_box();
621  int mid_y = (box.top() + box.bottom()) / 2;
622  if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
623  if (!debug) {
624  tprintf("After already deleting %d boxes, ", num_deleted_boxes);
625  Print("Starting evaluation");
626  }
627  debug = true;
628  }
629  // A good box is one where the nearest neighbour on the inside is closer
630  // than half the distance to the nearest neighbour on the outside
631  // (of the putative column).
632  bool left = IsLeftTab();
633  int tab_x = XAtY(mid_y);
634  int gutter_width;
635  int neighbour_gap;
636  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
637  bbox, &gutter_width, &neighbour_gap);
638  if (debug) {
639  tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
640  box.left(), box.bottom(), box.right(), box.top(),
641  gutter_width, neighbour_gap);
642  }
643  // Now we can make the test.
644  if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
645  // A good box contributes its height to the good_length.
646  good_length += box.top() - box.bottom();
647  gutters.add(gutter_width, 1);
648  // Two good boxes together contribute the gap between them
649  // to the good_length as well, as long as the gap is not
650  // too big.
651  if (prev_good_box != nullptr) {
652  int vertical_gap = box.bottom() - prev_good_box->top();
653  double size1 = sqrt(static_cast<double>(prev_good_box->area()));
654  double size2 = sqrt(static_cast<double>(box.area()));
655  if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2))
656  good_length += vertical_gap;
657  if (debug) {
658  tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
659  vertical_gap, kMaxFillinMultiple * std::min(size1, size2),
660  good_length);
661  }
662  } else {
663  // Adjust the start to the first good box.
664  SetYStart(box.bottom());
665  }
666  prev_good_box = &box;
667  if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
668  text_on_image = true;
669  } else {
670  // Get rid of boxes that are not good.
671  if (debug) {
672  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
673  box.left(), box.bottom(), box.right(), box.top(),
674  gutter_width, neighbour_gap);
675  }
676  it.extract();
677  ++num_deleted_boxes;
678  }
679  }
680  if (debug) {
681  Print("Evaluating:");
682  }
683  // If there are any good boxes, do it again, except this time get rid of
684  // boxes that have a gutter that is a small fraction of the mean gutter.
685  // This filters out ends that run into a coincidental gap in the text.
686  int search_top = endpt_.y();
687  int search_bottom = startpt_.y();
688  int median_gutter = IntCastRounded(gutters.median());
689  if (gutters.get_total() > 0) {
690  prev_good_box = nullptr;
691  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
692  BLOBNBOX* bbox = it.data();
693  const TBOX& box = bbox->bounding_box();
694  int mid_y = (box.top() + box.bottom()) / 2;
695  // A good box is one where the gutter width is at least some constant
696  // fraction of the mean gutter width.
697  bool left = IsLeftTab();
698  int tab_x = XAtY(mid_y);
699  int max_gutter = kGutterMultiple * mean_height;
700  if (IsRagged()) {
701  // Ragged edges face a tougher test in that the gap must always be
702  // within the height of the blob.
703  max_gutter = kGutterToNeighbourRatio * mean_height;
704  }
705  int gutter_width;
706  int neighbour_gap;
707  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
708  bbox, &gutter_width, &neighbour_gap);
709  // Now we can make the test.
710  if (gutter_width >= median_gutter * kMinGutterFraction) {
711  if (prev_good_box == nullptr) {
712  // Adjust the start to the first good box.
713  SetYStart(box.bottom());
714  search_bottom = box.top();
715  }
716  prev_good_box = &box;
717  search_top = box.bottom();
718  } else {
719  // Get rid of boxes that are not good.
720  if (debug) {
721  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
722  box.left(), box.bottom(), box.right(), box.top(),
723  gutter_width, median_gutter);
724  }
725  it.extract();
726  ++num_deleted_boxes;
727  }
728  }
729  }
730  // If there has been a good box, adjust the end.
731  if (prev_good_box != nullptr) {
732  SetYEnd(prev_good_box->top());
733  // Compute the percentage of the vector that is occupied by good boxes.
734  int length = endpt_.y() - startpt_.y();
735  percent_score_ = 100 * good_length / length;
736  if (num_deleted_boxes > 0) {
737  needs_refit_ = true;
738  FitAndEvaluateIfNeeded(vertical, finder);
739  if (boxes_.empty())
740  return;
741  }
742  // Test the gutter over the whole vector, instead of just at the boxes.
743  int required_shift;
744  if (search_bottom > search_top) {
745  search_bottom = startpt_.y();
746  search_top = endpt_.y();
747  }
748  double min_gutter_width = kLineCountReciprocal / boxes_.length();
749  min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
750  min_gutter_width *= mean_height;
751  int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
752  if (median_gutter > max_gutter_width)
753  max_gutter_width = median_gutter;
754  int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
755  text_on_image, max_gutter_width,
756  &required_shift);
757  if (gutter_width < min_gutter_width) {
758  if (debug) {
759  tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
760  gutter_width, min_gutter_width);
761  }
762  boxes_.shallow_clear();
763  percent_score_ = 0;
764  } else if (debug) {
765  tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
766  gutter_width, min_gutter_width, required_shift);
767  }
768  } else {
769  // There are no good boxes left, so score is 0.
770  percent_score_ = 0;
771  }
772 
773  if (debug) {
774  Print("Evaluation complete:");
775  }
776 }
777 
778 // (Re)Fit a line to the stored points. Returns false if the line
779 // is degenerate. Althougth the TabVector code mostly doesn't care about the
780 // direction of lines, XAtY would give silly results for a horizontal line.
781 // The class is mostly aimed at use for vertical lines representing
782 // horizontal tab stops.
783 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
784  needs_refit_ = false;
785  if (boxes_.empty()) {
786  // Don't refit something with no boxes, as that only happens
787  // in Evaluate, and we don't want to end up with a zero vector.
788  if (!force_parallel)
789  return false;
790  // If we are forcing parallel, then we just need to set the sort_key_.
791  ICOORD midpt = startpt_;
792  midpt += endpt_;
793  midpt /= 2;
794  sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
795  return startpt_.y() != endpt_.y();
796  }
797  if (!force_parallel && !IsRagged()) {
798  // Use a fitted line as the vertical.
799  DetLineFit linepoints;
800  BLOBNBOX_C_IT it(&boxes_);
801  // Fit a line to all the boxes in the list.
802  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
803  BLOBNBOX* bbox = it.data();
804  const TBOX& box = bbox->bounding_box();
805  int x1 = IsRightTab() ? box.right() : box.left();
806  ICOORD boxpt(x1, box.bottom());
807  linepoints.Add(boxpt);
808  if (it.at_last()) {
809  ICOORD top_pt(x1, box.top());
810  linepoints.Add(top_pt);
811  }
812  }
813  linepoints.Fit(&startpt_, &endpt_);
814  if (startpt_.y() != endpt_.y()) {
815  vertical = endpt_;
816  vertical -= startpt_;
817  }
818  }
819  int start_y = startpt_.y();
820  int end_y = endpt_.y();
821  sort_key_ = IsLeftTab() ? INT32_MAX : -INT32_MAX;
822  BLOBNBOX_C_IT it(&boxes_);
823  // Choose a line parallel to the vertical such that all boxes are on the
824  // correct side of it.
825  mean_width_ = 0;
826  int width_count = 0;
827  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
828  BLOBNBOX* bbox = it.data();
829  const TBOX& box = bbox->bounding_box();
830  mean_width_ += box.width();
831  ++width_count;
832  int x1 = IsRightTab() ? box.right() : box.left();
833  // Test both the bottom and the top, as one will be more extreme, depending
834  // on the direction of skew.
835  int bottom_y = box.bottom();
836  int top_y = box.top();
837  int key = SortKey(vertical, x1, bottom_y);
838  if (IsLeftTab() == (key < sort_key_)) {
839  sort_key_ = key;
840  startpt_ = ICOORD(x1, bottom_y);
841  }
842  key = SortKey(vertical, x1, top_y);
843  if (IsLeftTab() == (key < sort_key_)) {
844  sort_key_ = key;
845  startpt_ = ICOORD(x1, top_y);
846  }
847  if (it.at_first())
848  start_y = bottom_y;
849  if (it.at_last())
850  end_y = top_y;
851  }
852  if (width_count > 0) {
853  mean_width_ = (mean_width_ + width_count - 1) / width_count;
854  }
855  endpt_ = startpt_ + vertical;
856  needs_evaluation_ = true;
857  if (start_y != end_y) {
858  // Set the ends of the vector to fully include the first and last blobs.
859  startpt_.set_x(XAtY(vertical, sort_key_, start_y));
860  startpt_.set_y(start_y);
861  endpt_.set_x(XAtY(vertical, sort_key_, end_y));
862  endpt_.set_y(end_y);
863  return true;
864  }
865  return false;
866 }
867 
868 // Returns the singleton partner if there is one, or nullptr otherwise.
870  if (!partners_.singleton())
871  return nullptr;
872  TabVector_C_IT partner_it(&partners_);
873  TabVector* partner = partner_it.data();
874  return partner;
875 }
876 
877 // Return the partner of this TabVector if the vector qualifies as
878 // being a vertical text line, otherwise nullptr.
880  if (!partners_.singleton())
881  return nullptr;
882  TabVector_C_IT partner_it(&partners_);
883  TabVector* partner = partner_it.data();
884  BLOBNBOX_C_IT box_it1(&boxes_);
885  BLOBNBOX_C_IT box_it2(&partner->boxes_);
886  // Count how many boxes are also in the other list.
887  // At the same time, gather the mean width and median vertical gap.
888  if (textord_debug_tabfind > 1) {
889  Print("Testing for vertical text");
890  partner->Print(" partner");
891  }
892  int num_matched = 0;
893  int num_unmatched = 0;
894  int total_widths = 0;
895  int width = startpt().x() - partner->startpt().x();
896  if (width < 0)
897  width = -width;
898  STATS gaps(0, width * 2);
899  BLOBNBOX* prev_bbox = nullptr;
900  box_it2.mark_cycle_pt();
901  for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
902  BLOBNBOX* bbox = box_it1.data();
903  TBOX box = bbox->bounding_box();
904  if (prev_bbox != nullptr) {
905  gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
906  }
907  while (!box_it2.cycled_list() && box_it2.data() != bbox &&
908  box_it2.data()->bounding_box().bottom() < box.bottom()) {
909  box_it2.forward();
910  }
911  if (!box_it2.cycled_list() && box_it2.data() == bbox &&
912  bbox->region_type() >= BRT_UNKNOWN &&
913  (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN))
914  ++num_matched;
915  else
916  ++num_unmatched;
917  total_widths += box.width();
918  prev_bbox = bbox;
919  }
920  if (num_unmatched + num_matched == 0) return nullptr;
921  double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
922  double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
923  int min_box_match = static_cast<int>((num_matched + num_unmatched) *
925  bool is_vertical = (gaps.get_total() > 0 &&
926  num_matched >= min_box_match &&
927  gaps.median() <= max_gap);
928  if (textord_debug_tabfind > 1) {
929  tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
930  "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
931  gaps.get_total(), num_matched, num_unmatched, min_box_match,
932  gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
933  }
934  return (is_vertical) ? partner : nullptr;
935 }
936 
937 // The constructor is private.
938 TabVector::TabVector(int extended_ymin, int extended_ymax,
939  TabAlignment alignment, BLOBNBOX_CLIST* boxes)
940  : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
941  sort_key_(0), percent_score_(0), mean_width_(0),
942  needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
943  top_constraints_(nullptr), bottom_constraints_(nullptr) {
944  BLOBNBOX_C_IT it(&boxes_);
945  it.add_list_after(boxes);
946 }
947 
948 // Delete this, but first, repoint all the partners to point to
949 // replacement. If replacement is nullptr, then partner relationships
950 // are removed.
951 void TabVector::Delete(TabVector* replacement) {
952  TabVector_C_IT it(&partners_);
953  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
954  TabVector* partner = it.data();
955  TabVector_C_IT p_it(&partner->partners_);
956  // If partner already has replacement in its list, then make
957  // replacement null, and just remove this TabVector when we find it.
958  TabVector* partner_replacement = replacement;
959  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
960  TabVector* p_partner = p_it.data();
961  if (p_partner == partner_replacement) {
962  partner_replacement = nullptr;
963  break;
964  }
965  }
966  // Remove all references to this, and replace with replacement if not nullptr.
967  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
968  TabVector* p_partner = p_it.data();
969  if (p_partner == this) {
970  p_it.extract();
971  if (partner_replacement != nullptr)
972  p_it.add_before_stay_put(partner_replacement);
973  }
974  }
975  if (partner_replacement != nullptr) {
976  partner_replacement->AddPartner(partner);
977  }
978  }
979  delete this;
980 }
981 
982 
983 } // namespace tesseract.
const ICOORD & topright() const
Definition: rect.h:104
bool IsSeparator() const
Definition: tabvector.h:221
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:167
void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap)
Definition: tabfind.cpp:209
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:637
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:783
void SetYEnd(int end_y)
Definition: tabvector.cpp:270
const ICOORD & startpt() const
Definition: tabvector.h:146
const char * kAlignmentNames[]
Definition: tabvector.cpp:510
bool IsRightTab() const
Definition: tabvector.h:217
static void CreateConstraint(TabVector *vector, bool is_top)
Definition: tabvector.cpp:64
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:59
int XAtY(int y) const
Definition: tabvector.h:189
TabVector * GetSinglePartner()
Definition: tabvector.cpp:869
void set_x(int16_t xin)
rewrite function
Definition: points.h:62
#define double_VAR(name, val, comment)
Definition: params.h:285
int16_t y() const
access_function
Definition: points.h:57
const double kMinGutterFraction
Definition: tabvector.cpp:47
Definition: rect.h:34
static void ApplyConstraints(TabConstraint_LIST *constraints)
Definition: tabvector.cpp:117
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:582
BlobTextFlowType flow() const
Definition: blobbox.h:296
static bool WithinTestRegion(int detail_level, int x, int y)
void Debug(const char *prefix)
Definition: tabvector.cpp:530
int extended_ymin() const
Definition: tabvector.h:155
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:164
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:500
Definition: statistc.h:33
static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:76
bool IsLeftTab() const
Definition: tabvector.h:213
const int kGutterMultiple
Definition: tabvector.cpp:36
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:276
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:487
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:961
int16_t width() const
Definition: rect.h:115
const double kMinAlignedGutter
Definition: tabvector.cpp:51
bool textord_debug_printable
Definition: alignedblob.cpp:34
int16_t left() const
Definition: rect.h:72
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:56
int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, int max_gutter_width, int *required_shift)
Definition: tabfind.cpp:162
int16_t top() const
Definition: rect.h:58
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
double median() const
Definition: statistc.cpp:238
integer coordinate
Definition: points.h:32
int16_t x() const
access function
Definition: points.h:53
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:654
BlobRegionType region_type() const
Definition: blobbox.h:284
int textord_debug_tabfind
Definition: alignedblob.cpp:28
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:208
int IntCastRounded(double x)
Definition: helpers.h:168
static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:93
void SetupPartnerConstraints()
Definition: tabvector.cpp:297
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:241
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:453
const double kMinRaggedGutter
Definition: tabvector.cpp:53
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:790
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int32_t area() const
Definition: rect.h:122
void rotate(const FCOORD &vec)
Definition: points.h:537
int extended_ymax() const
Definition: tabvector.h:152
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
const ICOORD & botleft() const
Definition: rect.h:92
const double kLineCountReciprocal
Definition: tabvector.cpp:49
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:280
ICOORD topleft() const
Definition: rect.h:100
const int kGutterToNeighbourRatio
Definition: tabvector.cpp:38
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:356
Definition: points.h:189
const TBOX & bounding_box() const
Definition: blobbox.h:231
const int kSimilarRaggedDist
Definition: tabvector.cpp:43
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:879
int16_t right() const
Definition: rect.h:79
ICOORD botright() const
Definition: rect.h:96
const int kMaxFillinMultiple
Definition: tabvector.cpp:45
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:568
bool IsRagged() const
Definition: tabvector.h:229
void SetYStart(int start_y)
Definition: tabvector.cpp:265
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:804
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
TabVector * ShallowCopy() const
Definition: tabvector.cpp:228
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:389
const int kSimilarVectorDist
Definition: tabvector.cpp:40
void Pen(Color color)
Definition: scrollview.cpp:722
void set_y(int16_t yin)
rewrite function
Definition: points.h:66
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:542
int16_t bottom() const
Definition: rect.h:65
void Print(const char *prefix)
Definition: tabvector.cpp:520
int16_t height() const
Definition: rect.h:108
int32_t get_total() const
Definition: statistc.h:86
const ICOORD & endpt() const
Definition: tabvector.h:149
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:534