tesseract  5.0.0-alpha-619-ge9db
tabvector.h
Go to the documentation of this file.
1 // File: tabvector.h
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_TEXTORD_TABVECTOR_H_
20 #define TESSERACT_TEXTORD_TABVECTOR_H_
21 
22 #include "blobgrid.h"
23 #include "clst.h"
24 #include "elst.h"
25 #include "elst2.h"
26 #include "rect.h"
27 #include "bbgrid.h"
28 
29 #include <algorithm>
30 
31 class BLOBNBOX;
32 class ScrollView;
33 
34 namespace tesseract {
35 
36 
38  "Max fraction of mean blob width allowed for vertical gaps in vertical text");
40  "Fraction of box matches required to declare a line vertical");
41 
// The alignment type that a tab vector represents.
// Keep this enum synced with kAlignmentNames in tabvector.cpp.
enum TabAlignment {
  TA_LEFT_ALIGNED,
  TA_LEFT_RAGGED,
  TA_CENTER_JUSTIFIED,
  TA_RIGHT_ALIGNED,
  TA_RIGHT_RAGGED,
  TA_SEPARATOR,
  // Number of alignment types above; must remain the last enumerator.
  TA_COUNT
};
53 
54 // Forward declarations. The classes use their own list types, so we
55 // need to make the list types first.
56 class TabFind;
57 class TabVector;
58 class TabConstraint;
59 
60 ELIST2IZEH(TabVector)
61 CLISTIZEH(TabVector)
62 ELISTIZEH(TabConstraint)
63 
64 // TabConstraint is a totally self-contained class to maintain
65 // a list of [min,max] constraints, each referring to a TabVector.
66 // The constraints are manipulated through static methods that act
67 // on a list of constraints. The list itself is cooperatively owned
68 // by the TabVectors of the constraints on the list and managed
69 // by implicit reference counting via the elements of the list.
70 class TabConstraint : public ELIST_LINK {
71  public:
72  // This empty constructor is here only so that the class can be ELISTIZED.
73  // TODO(rays) change deep_copy in elst.h line 955 to take a callback copier
74  // and eliminate CLASSNAME##_copier.
75  TabConstraint() = default;
76 
77  // Create a constraint for the top or bottom of this TabVector.
78  static void CreateConstraint(TabVector* vector, bool is_top);
79 
80  // Test to see if the constraints are compatible enough to merge.
81  static bool CompatibleConstraints(TabConstraint_LIST* list1,
82  TabConstraint_LIST* list2);
83 
84  // Merge the lists of constraints and update the TabVector pointers.
85  // The second list is deleted.
86  static void MergeConstraints(TabConstraint_LIST* list1,
87  TabConstraint_LIST* list2);
88 
89  // Set all the tops and bottoms as appropriate to a mean of the
90  // constrained range. Delete all the constraints and list.
91  static void ApplyConstraints(TabConstraint_LIST* constraints);
92 
93  private:
94  TabConstraint(TabVector* vector, bool is_top);
95 
96  // Get the max of the mins and the min of the maxes.
97  static void GetConstraints(TabConstraint_LIST* constraints,
98  int* y_min, int* y_max);
99 
100  // The TabVector this constraint applies to.
101  TabVector* vector_;
102  // If true then we refer to the top of the vector_.
103  bool is_top_;
104  // The allowed range of this vector_.
105  int y_min_;
106  int y_max_;
107 };
108 
109 // Class to hold information about a single vector
110 // that represents a tab stop or a rule line.
111 class TabVector : public ELIST2_LINK {
112  public:
113  // TODO(rays) fix this in elst.h line 1076, where it should use the
114  // copy constructor instead of operator=.
115  TabVector() = default;
116  ~TabVector() = default;
117 
118  // Public factory to build a TabVector from a list of boxes.
119  // The TabVector will be of the given alignment type.
120  // The input vertical vector is used in fitting, and the output
121  // vertical_x, vertical_y have the resulting line vector added to them
122  // if the alignment is not ragged.
123  // The extended_start_y and extended_end_y are the maximum possible
124  // extension to the line segment that can be used to align with others.
125  // The input CLIST of BLOBNBOX good_points is consumed and taken over.
126  static TabVector* FitVector(TabAlignment alignment, ICOORD vertical,
127  int extended_start_y, int extended_end_y,
128  BLOBNBOX_CLIST* good_points,
129  int* vertical_x, int* vertical_y);
130 
131  // Build a ragged TabVector by copying another's direction, shifting it
132  // to match the given blob, and making its initial extent the height
133  // of the blob, but its extended bounds from the bounds of the original.
134  TabVector(const TabVector& src, TabAlignment alignment,
135  const ICOORD& vertical_skew, BLOBNBOX* blob);
136 
137  // Copies basic attributes of a tab vector for simple operations.
138  // Copies things such startpt, endpt, range, width.
139  // Does not copy things such as partners, boxes, or constraints.
140  // This is useful if you only need vector information for processing, such
141  // as in the table detection code.
142  TabVector* ShallowCopy() const;
143 
144  // Simple accessors.
145  const ICOORD& startpt() const {
146  return startpt_;
147  }
148  const ICOORD& endpt() const {
149  return endpt_;
150  }
151  int extended_ymax() const {
152  return extended_ymax_;
153  }
154  int extended_ymin() const {
155  return extended_ymin_;
156  }
157  int sort_key() const {
158  return sort_key_;
159  }
160  int mean_width() const {
161  return mean_width_;
162  }
163  void set_top_constraints(TabConstraint_LIST* constraints) {
164  top_constraints_ = constraints;
165  }
166  void set_bottom_constraints(TabConstraint_LIST* constraints) {
167  bottom_constraints_ = constraints;
168  }
169  TabVector_CLIST* partners() {
170  return &partners_;
171  }
172  void set_startpt(const ICOORD& start) {
173  startpt_ = start;
174  }
175  void set_endpt(const ICOORD& end) {
176  endpt_ = end;
177  }
178  bool intersects_other_lines() const {
179  return intersects_other_lines_;
180  }
181  void set_intersects_other_lines(bool value) {
182  intersects_other_lines_ = value;
183  }
184 
185  // Inline quasi-accessors that require some computation.
186 
187  // Compute the x coordinate at the given y coordinate.
188  int XAtY(int y) const {
189  int height = endpt_.y() - startpt_.y();
190  if (height != 0)
191  return (y - startpt_.y()) * (endpt_.x() - startpt_.x()) / height +
192  startpt_.x();
193  else
194  return startpt_.x();
195  }
196 
197  // Compute the vertical overlap with the other TabVector.
198  int VOverlap(const TabVector& other) const {
199  return std::min(other.endpt_.y(), endpt_.y()) -
200  std::max(other.startpt_.y(), startpt_.y());
201  }
202  // Compute the vertical overlap with the given y bounds.
203  int VOverlap(int top_y, int bottom_y) const {
204  return std::min(top_y, static_cast<int>(endpt_.y())) - std::max(bottom_y, static_cast<int>(startpt_.y()));
205  }
206  // Compute the extended vertical overlap with the given y bounds.
207  int ExtendedOverlap(int top_y, int bottom_y) const {
208  return std::min(top_y, extended_ymax_) - std::max(bottom_y, extended_ymin_);
209  }
210 
211  // Return true if this is a left tab stop, either aligned, or ragged.
212  bool IsLeftTab() const {
213  return alignment_ == TA_LEFT_ALIGNED || alignment_ == TA_LEFT_RAGGED;
214  }
215  // Return true if this is a right tab stop, either aligned, or ragged.
216  bool IsRightTab() const {
217  return alignment_ == TA_RIGHT_ALIGNED || alignment_ == TA_RIGHT_RAGGED;
218  }
219  // Return true if this is a separator.
220  bool IsSeparator() const {
221  return alignment_ == TA_SEPARATOR;
222  }
223  // Return true if this is a center aligned tab stop.
224  bool IsCenterTab() const {
225  return alignment_ == TA_CENTER_JUSTIFIED;
226  }
227  // Return true if this is a ragged tab top, either left or right.
228  bool IsRagged() const {
229  return alignment_ == TA_LEFT_RAGGED || alignment_ == TA_RIGHT_RAGGED;
230  }
231 
232  // Return true if this vector is to the left of the other in terms
233  // of sort_key_.
234  bool IsLeftOf(const TabVector& other) const {
235  return sort_key_ < other.sort_key_;
236  }
237 
238  // Return true if the vector has no partners.
239  bool Partnerless() {
240  return partners_.empty();
241  }
242 
243  // Return the number of tab boxes in this vector.
244  int BoxCount() {
245  return boxes_.length();
246  }
247 
248  // Lock the vector from refits by clearing the boxes_ list.
249  void Freeze() {
250  boxes_.shallow_clear();
251  }
252 
253  // Flip x and y on the ends so a vector can be created from flipped input.
254  void XYFlip() {
255  int x = startpt_.y();
256  startpt_.set_y(startpt_.x());
257  startpt_.set_x(x);
258  x = endpt_.y();
259  endpt_.set_y(endpt_.x());
260  endpt_.set_x(x);
261  }
262 
263  // Reflect the tab vector in the y-axis.
264  void ReflectInYAxis() {
265  startpt_.set_x(-startpt_.x());
266  endpt_.set_x(-endpt_.x());
267  sort_key_ = -sort_key_;
268  if (alignment_ == TA_LEFT_ALIGNED)
269  alignment_ = TA_RIGHT_ALIGNED;
270  else if (alignment_ == TA_RIGHT_ALIGNED)
271  alignment_ = TA_LEFT_ALIGNED;
272  if (alignment_ == TA_LEFT_RAGGED)
273  alignment_ = TA_RIGHT_RAGGED;
274  else if (alignment_ == TA_RIGHT_RAGGED)
275  alignment_ = TA_LEFT_RAGGED;
276  }
277 
278  // Separate function to compute the sort key for a given coordinate pair.
279  static int SortKey(const ICOORD& vertical, int x, int y) {
280  ICOORD pt(x, y);
281  return pt * vertical;
282  }
283 
284  // Return the x at the given y for the given sort key.
285  static int XAtY(const ICOORD& vertical, int sort_key, int y) {
286  if (vertical.y() != 0)
287  return (vertical.x() * y + sort_key) / vertical.y();
288  else
289  return sort_key;
290  }
291 
292  // Sort function for E2LIST::sort to sort by sort_key_.
293  static int SortVectorsByKey(const void* v1, const void* v2) {
294  const TabVector* tv1 = *static_cast<const TabVector* const*>(v1);
295  const TabVector* tv2 = *static_cast<const TabVector* const*>(v2);
296  return tv1->sort_key_ - tv2->sort_key_;
297  }
298 
299  // More complex members.
300 
301  // Extend this vector to include the supplied blob if it doesn't
302  // already have it.
303  void ExtendToBox(BLOBNBOX* blob);
304 
305  // Set the ycoord of the start and move the xcoord to match.
306  void SetYStart(int start_y);
307  // Set the ycoord of the end and move the xcoord to match.
308  void SetYEnd(int end_y);
309 
310  // Rotate the ends by the given vector.
311  void Rotate(const FCOORD& rotation);
312 
313  // Setup the initial constraints, being the limits of
314  // the vector and the extended ends.
315  void SetupConstraints();
316 
317  // Setup the constraints between the partners of this TabVector.
319 
320  // Setup the constraints between this and its partner.
321  void SetupPartnerConstraints(TabVector* partner);
322 
323  // Use the constraints to modify the top and bottom.
324  void ApplyConstraints();
325 
326  // Merge close tab vectors of the same side that overlap.
327  static void MergeSimilarTabVectors(const ICOORD& vertical,
328  TabVector_LIST* vectors, BlobGrid* grid);
329 
330  // Return true if this vector is the same side, overlaps, and close
331  // enough to the other to be merged.
332  bool SimilarTo(const ICOORD& vertical,
333  const TabVector& other, BlobGrid* grid) const;
334 
335  // Eat the other TabVector into this and delete it.
336  void MergeWith(const ICOORD& vertical, TabVector* other);
337 
338  // Add a new element to the list of partner TabVectors.
339  // Partners must be added in order of increasing y coordinate of the text line
340  // that makes them partners.
341  // Groups of identical partners are merged into one.
342  void AddPartner(TabVector* partner);
343 
344  // Return true if other is a partner of this.
345  bool IsAPartner(const TabVector* other);
346 
347  // Print basic information about this tab vector.
348  void Print(const char* prefix);
349 
350  // Print basic information about this tab vector and every box in it.
351  void Debug(const char* prefix);
352 
353  // Draw this tabvector in place in the given window.
354  void Display(ScrollView* tab_win);
355 
356  // Refit the line and/or re-evaluate the vector if the dirty flags are set.
357  void FitAndEvaluateIfNeeded(const ICOORD& vertical, TabFind* finder);
358 
359  // Evaluate the vector in terms of coverage of its length by good-looking
360  // box edges. A good looking box is one where its nearest neighbour on the
361  // inside is nearer than half the distance its nearest neighbour on the
362  // outside of the putative column. Bad boxes are removed from the line.
363  // A second pass then further filters boxes by requiring that the gutter
364  // width be a minimum fraction of the mean gutter along the line.
365  void Evaluate(const ICOORD& vertical, TabFind* finder);
366 
367  // (Re)Fit a line to the stored points. Returns false if the line
368  // is degenerate. Althougth the TabVector code mostly doesn't care about the
369  // direction of lines, XAtY would give silly results for a horizontal line.
370  // The class is mostly aimed at use for vertical lines representing
371  // horizontal tab stops.
372  bool Fit(ICOORD vertical, bool force_parallel);
373 
374  // Return the partner of this TabVector if the vector qualifies as
375  // being a vertical text line, otherwise nullptr.
377 
378  // Return the matching tabvector if there is exactly one partner, or
379  // nullptr otherwise. This can be used after matching is done, eg. by
380  // VerticalTextlinePartner(), without checking if the line is vertical.
382 
383  private:
384  // Constructor is private as the static factory is the external way
385  // to build a TabVector.
387  TabAlignment alignment, BLOBNBOX_CLIST* boxes);
388 
389  // Delete this, but first, repoint all the partners to point to
390  // replacement. If replacement is nullptr, then partner relationships
391  // are removed.
392  void Delete(TabVector* replacement);
393 
394  private:
395  // The bottom of the tab line.
396  ICOORD startpt_;
397  // The top of the tab line.
398  ICOORD endpt_;
399  // The lowest y that the vector might extend to.
400  int extended_ymin_ = 0;
401  // The highest y that the vector might extend to.
402  int extended_ymax_ = 0;
403  // Perpendicular distance of vector from a given vertical for sorting.
404  int sort_key_ = 0;
405  // Result of Evaluate 0-100. Coverage of line with good boxes.
406  int percent_score_ = 0;
407  // The mean width of the blobs. Meaningful only for separator lines.
408  int mean_width_ = 0;
409  // True if the boxes_ list has been modified, so a refit is needed.
410  bool needs_refit_ = false;
411  // True if a fit has been done, so re-evaluation is needed.
412  bool needs_evaluation_ = false;
413  // True if a separator line intersects at least 2 other lines.
414  bool intersects_other_lines_ = false;
415  // The type of this TabVector.
416  TabAlignment alignment_ = TA_LEFT_ALIGNED;
417  // The list of boxes whose edges are aligned at this TabVector.
418  BLOBNBOX_CLIST boxes_;
419  // List of TabVectors that have a connection with this via a text line.
420  TabVector_CLIST partners_;
421  // Constraints used to resolve the exact location of the top and bottom
422  // of the tab line.
423  TabConstraint_LIST* top_constraints_ = nullptr;
424  TabConstraint_LIST* bottom_constraints_ = nullptr;
425 };
426 
427 } // namespace tesseract.
428 
429 #endif // TESSERACT_TEXTORD_TABVECTOR_H_
tesseract::TabVector::SortVectorsByKey
static int SortVectorsByKey(const void *v1, const void *v2)
Definition: tabvector.h:293
tesseract::TabVector::SetYEnd
void SetYEnd(int end_y)
Definition: tabvector.cpp:267
tesseract::TA_LEFT_RAGGED
Definition: tabvector.h:46
ScrollView
Definition: scrollview.h:97
elst.h
ICOORD::set_x
void set_x(int16_t xin)
rewrite function
Definition: points.h:60
tesseract::TabVector::AddPartner
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:484
tesseract::TA_SEPARATOR
Definition: tabvector.h:50
tesseract::TabVector
Definition: tabvector.h:111
tesseract::TabVector::SortKey
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:279
tesseract::TabVector::FitVector
static TabVector * FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y, int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x, int *vertical_y)
Definition: tabvector.cpp:176
tesseract::textord_tabvector_vertical_box_ratio
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:58
tesseract::TabVector::MergeWith
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:450
tesseract::TabVector::endpt
const ICOORD & endpt() const
Definition: tabvector.h:148
tesseract::TabVector::SetupPartnerConstraints
void SetupPartnerConstraints()
Definition: tabvector.cpp:294
bbgrid.h
tesseract::TabVector::Evaluate
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:579
tesseract::TabVector::SimilarTo
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:386
tesseract::TabVector::Print
void Print(const char *prefix)
Definition: tabvector.cpp:517
ICOORD
integer coordinate
Definition: points.h:30
tesseract::TabVector::mean_width
int mean_width() const
Definition: tabvector.h:160
tesseract::TabVector::set_endpt
void set_endpt(const ICOORD &end)
Definition: tabvector.h:175
tesseract::BlobGrid
Definition: blobgrid.h:33
tesseract::TabVector::Rotate
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:273
tesseract::TA_RIGHT_ALIGNED
Definition: tabvector.h:48
tesseract::TabVector::IsCenterTab
bool IsCenterTab() const
Definition: tabvector.h:224
tesseract::TabVector::Fit
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:780
tesseract::TabVector::set_startpt
void set_startpt(const ICOORD &start)
Definition: tabvector.h:172
tesseract::TabVector::ExtendedOverlap
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:207
tesseract::TabVector::ApplyConstraints
void ApplyConstraints()
Definition: tabvector.cpp:345
tesseract::TabVector::set_intersects_other_lines
void set_intersects_other_lines(bool value)
Definition: tabvector.h:181
rect.h
tesseract::TabVector::ExtendToBox
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:238
ICOORD::x
int16_t x() const
access function
Definition: points.h:51
FCOORD
Definition: points.h:187
BLOBNBOX
Definition: blobbox.h:142
double_VAR_H
#define double_VAR_H(name, val, comment)
Definition: params.h:298
tesseract::TabVector::partners
TabVector_CLIST * partners()
Definition: tabvector.h:169
elst2.h
ICOORD::set_y
void set_y(int16_t yin)
rewrite function
Definition: points.h:64
ELISTIZEH
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:907
tesseract::TabVector::IsLeftTab
bool IsLeftTab() const
Definition: tabvector.h:212
tesseract::TabVector::BoxCount
int BoxCount()
Definition: tabvector.h:244
tesseract::TabVector::XAtY
static int XAtY(const ICOORD &vertical, int sort_key, int y)
Definition: tabvector.h:285
tesseract::TabVector::~TabVector
~TabVector()=default
tesseract::TabVector::sort_key
int sort_key() const
Definition: tabvector.h:157
ELIST2IZEH
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:917
tesseract::TabVector::set_top_constraints
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:163
tesseract::TabVector::set_bottom_constraints
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:166
tesseract::TabVector::Debug
void Debug(const char *prefix)
Definition: tabvector.cpp:527
tesseract::TabVector::extended_ymin
int extended_ymin() const
Definition: tabvector.h:154
CLISTIZEH
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:50
blobgrid.h
tesseract::TA_LEFT_ALIGNED
Definition: tabvector.h:45
tesseract::TabVector::VerticalTextlinePartner
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:876
tesseract::TabVector::SetupConstraints
void SetupConstraints()
Definition: tabvector.cpp:288
tesseract::TabVector::XAtY
int XAtY(int y) const
Definition: tabvector.h:188
tesseract::TabVector::ReflectInYAxis
void ReflectInYAxis()
Definition: tabvector.h:264
tesseract
Definition: baseapi.h:65
tesseract::TabVector::intersects_other_lines
bool intersects_other_lines() const
Definition: tabvector.h:178
tesseract::TabConstraint
Definition: tabvector.h:70
tesseract::textord_tabvector_vertical_gap_fraction
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:55
tesseract::TabVector::GetSinglePartner
TabVector * GetSinglePartner()
Definition: tabvector.cpp:866
tesseract::TabVector::VOverlap
int VOverlap(const TabVector &other) const
Definition: tabvector.h:198
tesseract::TabVector::MergeSimilarTabVectors
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:353
tesseract::TabVector::IsRightTab
bool IsRightTab() const
Definition: tabvector.h:216
tesseract::TabVector::Freeze
void Freeze()
Definition: tabvector.h:249
tesseract::TabVector::IsLeftOf
bool IsLeftOf(const TabVector &other) const
Definition: tabvector.h:234
tesseract::TabVector::TabVector
TabVector()=default
tesseract::TabFind
Definition: tabfind.h:52
tesseract::TabVector::FitAndEvaluateIfNeeded
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:565
tesseract::TabVector::Display
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:539
tesseract::TabVector::startpt
const ICOORD & startpt() const
Definition: tabvector.h:145
tesseract::TabVector::XYFlip
void XYFlip()
Definition: tabvector.h:254
ELIST2_LINK
Definition: elst2.h:53
ELIST_LINK
Definition: elst.h:74
tesseract::TabVector::IsAPartner
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:497
tesseract::TabVector::extended_ymax
int extended_ymax() const
Definition: tabvector.h:151
tesseract::TA_CENTER_JUSTIFIED
Definition: tabvector.h:47
tesseract::TabAlignment
TabAlignment
Definition: tabvector.h:44
tesseract::TA_COUNT
Definition: tabvector.h:51
tesseract::TabVector::IsSeparator
bool IsSeparator() const
Definition: tabvector.h:220
tesseract::TabVector::ShallowCopy
TabVector * ShallowCopy() const
Definition: tabvector.cpp:225
tesseract::TA_RIGHT_RAGGED
Definition: tabvector.h:49
tesseract::TabVector::Partnerless
bool Partnerless()
Definition: tabvector.h:239
tesseract::TabVector::SetYStart
void SetYStart(int start_y)
Definition: tabvector.cpp:262
tesseract::TabVector::IsRagged
bool IsRagged() const
Definition: tabvector.h:228
tesseract::TabVector::VOverlap
int VOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:203
clst.h
ICOORD::y
int16_t y() const
access_function
Definition: points.h:55