tesseract  5.0.0-alpha-619-ge9db
pageiterator.h
Go to the documentation of this file.
1 // File: pageiterator.h
3 // Description: Iterator for tesseract page structure that avoids using
4 // tesseract internal data structures.
5 // Author: Ray Smith
6 // Created: Fri Feb 26 11:01:06 PST 2010
7 //
8 // (C) Copyright 2010, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
22 #define TESSERACT_CCMAIN_PAGEITERATOR_H_
23 
24 #include "platform.h"
25 #include "publictypes.h"
26 
27 struct BlamerBundle;
28 class C_BLOB_IT;
29 class PAGE_RES;
30 class PAGE_RES_IT;
31 class WERD;
32 struct Pix;
33 struct Pta;
34 
35 namespace tesseract {
36 
37 class Tesseract;
38 
53  public:
68  PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
69  int scaled_yres, int rect_left, int rect_top, int rect_width,
70  int rect_height);
71  virtual ~PageIterator();
72 
79  PageIterator(const PageIterator& src);
80  const PageIterator& operator=(const PageIterator& src);
81 
83  bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
84 
85  // ============= Moving around within the page ============.
86 
91  virtual void Begin();
92 
98  virtual void RestartParagraph();
99 
104  bool IsWithinFirstTextlineOfParagraph() const;
105 
111  virtual void RestartRow();
112 
124  virtual bool Next(PageIteratorLevel level);
125 
139  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
140 
157  virtual bool IsAtFinalElement(PageIteratorLevel level,
158  PageIteratorLevel element) const;
159 
166  int Cmp(const PageIterator& other) const;
167 
168  // ============= Accessing data ==============.
169  // Coordinate system:
170  // Integer coordinates are at the cracks between the pixels.
171  // The top-left corner of the top-left pixel in the image is at (0,0).
172  // The bottom-right corner of the bottom-right pixel in the image is at
173  // (width, height).
174  // Every bounding box goes from the top-left of the top-left contained
175  // pixel to the bottom-right of the bottom-right contained pixel, so
176  // the bounding box of the single top-left pixel in the image is:
177  // (0,0)->(1,1).
178  // If an image rectangle has been set in the API, then returned coordinates
179  // relate to the original (full) image, rather than the rectangle.
180 
// Records which components the caller wants bounding boxes to include.
// Copies include_upper_dots into the member include_upper_dots_ and
// include_lower_dots into the member include_lower_dots_; nothing else is
// touched and nothing is returned.
// NOTE(review): presumably these flags are consulted by the BoundingBox*
// methods to decide whether dots/diacritics above or below the text are
// part of the reported box -- confirm against the implementation file.
190  void SetBoundingBoxComponents(bool include_upper_dots,
191  bool include_lower_dots) {
192  include_upper_dots_ = include_upper_dots;
193  include_lower_dots_ = include_lower_dots;
194  }
195 
205  bool BoundingBox(PageIteratorLevel level, int* left, int* top, int* right,
206  int* bottom) const;
207  bool BoundingBox(PageIteratorLevel level, int padding, int* left, int* top,
208  int* right, int* bottom) const;
214  bool BoundingBoxInternal(PageIteratorLevel level, int* left, int* top,
215  int* right, int* bottom) const;
216 
218  bool Empty(PageIteratorLevel level) const;
219 
224  PolyBlockType BlockType() const;
225 
233  Pta* BlockPolygon() const;
234 
241  Pix* GetBinaryImage(PageIteratorLevel level) const;
242 
254  Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
255  int* left, int* top) const;
256 
263  bool Baseline(PageIteratorLevel level, int* x1, int* y1, int* x2,
264  int* y2) const;
265 
274  void Orientation(tesseract::Orientation* orientation,
275  tesseract::WritingDirection* writing_direction,
276  tesseract::TextlineOrder* textline_order,
277  float* deskew_angle) const;
278 
307  void ParagraphInfo(tesseract::ParagraphJustification* justification,
308  bool* is_list_item, bool* is_crown,
309  int* first_line_indent) const;
310 
311  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
312  // of the current word to the given pointer (takes ownership of the pointer)
313  // and returns true.
314  // Can only be used when iterating on the word level.
315  bool SetWordBlamerBundle(BlamerBundle* blamer_bundle);
316 
317  protected:
322  TESS_LOCAL void BeginWord(int offset);
323 
347  C_BLOB_IT* cblob_it_;
352  int scale_;
358 };
359 
360 } // namespace tesseract.
361 
362 #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
tesseract::PageIterator::rect_height_
int rect_height_
Definition: pageiterator.h:357
tesseract::PageIterator::it_
PAGE_RES_IT * it_
Definition: pageiterator.h:332
TESS_LOCAL
#define TESS_LOCAL
Definition: platform.h:55
tesseract::PageIterator::page_res_
PAGE_RES * page_res_
Definition: pageiterator.h:325
tesseract::Tesseract
Definition: tesseractclass.h:172
tesseract::PageIterator
Definition: pageiterator.h:52
tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:248
platform.h
tesseract::PageIterator::scale_
int scale_
Definition: pageiterator.h:352
tesseract::WritingDirection
WritingDirection
Definition: publictypes.h:131
tesseract::PageIterator::include_lower_dots_
bool include_lower_dots_
Definition: pageiterator.h:350
tesseract::PageIterator::word_length_
int word_length_
Definition: pageiterator.h:339
tesseract::TextlineOrder
TextlineOrder
Definition: publictypes.h:148
tesseract::PageIterator::tesseract_
Tesseract * tesseract_
Definition: pageiterator.h:327
publictypes.h
tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:216
tesseract::PageIterator::word_
WERD * word_
Definition: pageiterator.h:337
tesseract
Definition: baseapi.h:65
PAGE_RES
Definition: pageres.h:73
tesseract::PageIterator::include_upper_dots_
bool include_upper_dots_
Definition: pageiterator.h:349
tesseract::PageIterator::cblob_it_
C_BLOB_IT * cblob_it_
Definition: pageiterator.h:347
PAGE_RES_IT
Definition: pageres.h:668
WERD
Definition: werd.h:55
tesseract::PageIterator::SetBoundingBoxComponents
void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots)
Definition: pageiterator.h:190
tesseract::PageIterator::rect_top_
int rect_top_
Definition: pageiterator.h:355
TESS_API
#define TESS_API
Definition: platform.h:54
tesseract::PageIterator::scaled_yres_
int scaled_yres_
Definition: pageiterator.h:353
PolyBlockType
PolyBlockType
Definition: publictypes.h:52
tesseract::Orientation
Orientation
Definition: publictypes.h:116
BlamerBundle
Definition: blamer.h:103
tesseract::PageIterator::rect_width_
int rect_width_
Definition: pageiterator.h:356
tesseract::PageIterator::rect_left_
int rect_left_
Definition: pageiterator.h:354
tesseract::PageIterator::blob_index_
int blob_index_
Definition: pageiterator.h:341