tesseract  4.0.0-1-g2a2b
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

Public Member Functions

virtual ~ResultIterator ()=default
 
virtual void Begin ()
 
virtual bool Next (PageIteratorLevel level)
 
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
 
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
 
int BlanksBeforeWord () const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices () const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~LTRResultIterator ()
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
void RowAttributes (float *row_height, float *descenders, float *ascenders) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
int BlanksBeforeWord () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
const void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
bool HasTruthString () const
 
bool EquivalentToTruth (const char *str) const
 
char * WordTruthUTF8Text () const
 
char * WordNormedUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
int Cmp (const PageIterator &other) const
 
void SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBox (PageIteratorLevel level, const int padding, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pta * BlockPolygon () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 
bool SetWordBlamerBundle (BlamerBundle *blamer_bundle)
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void BeginWord (int offset)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
bool include_upper_dots_
 
bool include_lower_dots_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Detailed Description

Definition at line 41 of file resultiterator.h.

Constructor & Destructor Documentation

◆ ~ResultIterator()

virtual tesseract::ResultIterator::~ResultIterator ( )
virtual default

ResultIterator is copy constructible! The default copy constructor works just fine for us.

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator &  resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This constructor is not public (it is declared protected) because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 35 of file resultiterator.cpp.

36  : LTRResultIterator(resit) {
37  in_minor_direction_ = false;
38  at_beginning_of_minor_run_ = false;
39  preserve_interword_spaces_ = false;
40 
41  BoolParam *p = ParamUtils::FindParam<BoolParam>(
42  "preserve_interword_spaces", GlobalParams()->bool_params,
44  if (p != nullptr) preserve_interword_spaces_ = (bool)(*p);
45 
46  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
47  MoveToLogicalStartOfTextline();
48 }
GenericVector< BoolParam * > bool_params
Definition: params.h:45
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:32
ParamsVectors * params()
Definition: ccutil.h:62
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)

Member Function Documentation

◆ Begin()

void tesseract::ResultIterator::Begin ( )
virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 415 of file resultiterator.cpp.

415  {
417  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
418  in_minor_direction_ = false;
419  at_beginning_of_minor_run_ = false;
420  MoveToLogicalStartOfTextline();
421 }

◆ BlanksBeforeWord()

int tesseract::ResultIterator::BlanksBeforeWord ( ) const

Definition at line 555 of file resultiterator.cpp.

555  {
556  if (CurrentParagraphIsLtr()) return LTRResultIterator::BlanksBeforeWord();
557  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
558 }
virtual bool IsAtBeginningOf(PageIteratorLevel level) const

◆ CalculateTextlineOrder()

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values:
- kMinorRunStart: Start of minor direction text.
- kMinorRunEnd: End of minor direction text.
- kComplexWord: The next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 257 of file resultiterator.cpp.

260  {
261  reading_order->truncate(0);
262  if (word_dirs.size() == 0) return;
263 
264  // Take all of the runs of minor direction words and insert them
265  // in reverse order.
266  int minor_direction, major_direction, major_step, start, end;
267  if (paragraph_is_ltr) {
268  start = 0;
269  end = word_dirs.size();
270  major_step = 1;
271  major_direction = DIR_LEFT_TO_RIGHT;
272  minor_direction = DIR_RIGHT_TO_LEFT;
273  } else {
274  start = word_dirs.size() - 1;
275  end = -1;
276  major_step = -1;
277  major_direction = DIR_RIGHT_TO_LEFT;
278  minor_direction = DIR_LEFT_TO_RIGHT;
279  // Special rule: if there are neutral words at the right most side
280  // of a line adjacent to a left-to-right word in the middle of the
281  // line, we interpret the end of the line as a single LTR sequence.
282  if (word_dirs[start] == DIR_NEUTRAL) {
283  int neutral_end = start;
284  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
285  neutral_end--;
286  }
287  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
288  // LTR followed by neutrals.
289  // Scan for the beginning of the minor left-to-right run.
290  int left = neutral_end;
291  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
292  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
293  }
294  reading_order->push_back(kMinorRunStart);
295  for (int i = left; i < word_dirs.size(); i++) {
296  reading_order->push_back(i);
297  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
298  }
299  reading_order->push_back(kMinorRunEnd);
300  start = left - 1;
301  }
302  }
303  }
304  for (int i = start; i != end;) {
305  if (word_dirs[i] == minor_direction) {
306  int j = i;
307  while (j != end && word_dirs[j] != major_direction)
308  j += major_step;
309  if (j == end) j -= major_step;
310  while (j != i && word_dirs[j] != minor_direction)
311  j -= major_step;
312  // [j..i] is a minor direction run.
313  reading_order->push_back(kMinorRunStart);
314  for (int k = j; k != i; k -= major_step) {
315  reading_order->push_back(k);
316  }
317  reading_order->push_back(i);
318  reading_order->push_back(kMinorRunEnd);
319  i = j + major_step;
320  } else {
321  reading_order->push_back(i);
322  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
323  i += major_step;
324  }
325  }
326 }
int size() const
Definition: genericvector.h:71
static const int kComplexWord
int push_back(T object)
static const int kMinorRunStart
void truncate(int size)
static const int kMinorRunEnd

◆ GetBestLSTMSymbolChoices()

std::vector< std::vector< std::pair< const char *, float > > > * tesseract::ResultIterator::GetBestLSTMSymbolChoices ( ) const
virtual

Returns the LSTM choices for every LSTM timestep for the current word.

Definition at line 607 of file resultiterator.cpp.

607  {
608  if (it_->word() != nullptr) {
609  return &it_->word()->timesteps;
610  } else {
611  return nullptr;
612  }
613 }
WERD_RES * word() const
Definition: pageres.h:751
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:224

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Definition at line 564 of file resultiterator.cpp.

564  {
565  if (it_->word() == nullptr) return nullptr; // Already at the end!
566  STRING text;
567  switch (level) {
568  case RIL_BLOCK:
569  {
570  ResultIterator pp(*this);
571  do {
572  pp.AppendUTF8ParagraphText(&text);
573  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
574  }
575  break;
576  case RIL_PARA:
577  AppendUTF8ParagraphText(&text);
578  break;
579  case RIL_TEXTLINE:
580  {
581  ResultIterator it(*this);
582  it.MoveToLogicalStartOfTextline();
583  it.IterateAndAppendUTF8TextlineText(&text);
584  }
585  break;
586  case RIL_WORD:
587  AppendUTF8WordText(&text);
588  break;
589  case RIL_SYMBOL:
590  {
591  bool reading_direction_is_ltr =
592  current_paragraph_is_ltr_ ^ in_minor_direction_;
593  if (at_beginning_of_minor_run_) {
594  text += reading_direction_is_ltr ? kLRM : kRLM;
595  }
596  text = it_->word()->BestUTF8(blob_index_, false);
597  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
598  }
599  break;
600  }
601  int length = text.length() + 1;
602  char* result = new char[length];
603  strncpy(result, text.string(), length);
604  return result;
605 }
BLOCK_RES * block() const
Definition: pageres.h:757
const char * kRLM
Definition: unicodes.cpp:28
const char * string() const
Definition: strngs.cpp:196
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
WERD_RES * word() const
Definition: pageres.h:751
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:361
Definition: strngs.h:45
const char * kLRM
Definition: unicodes.cpp:27
int32_t length() const
Definition: strngs.cpp:191

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level. (as opposed to ResultIterator's left-to-right top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h

Reimplemented from tesseract::PageIterator.

Definition at line 498 of file resultiterator.cpp.

498  {
499  if (it_->block() == nullptr) return false; // Already at the end!
500  if (it_->word() == nullptr) return true; // In an image block.
501  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
502 
503  bool at_word_start = IsAtFirstSymbolOfWord();
504  if (level == RIL_WORD) return at_word_start;
505 
506  ResultIterator line_start(*this);
507  // move to the first word in the line...
508  line_start.MoveToLogicalStartOfTextline();
509 
510  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
511  if (level == RIL_TEXTLINE) return at_textline_start;
512 
513  // now we move to the left-most word...
514  line_start.RestartRow();
515  bool at_block_start = at_textline_start &&
516  line_start.it_->block() != line_start.it_->prev_block();
517  if (level == RIL_BLOCK) return at_block_start;
518 
519  bool at_para_start = at_block_start ||
520  (at_textline_start &&
521  line_start.it_->row()->row->para() !=
522  line_start.it_->prev_row()->row->para());
523  if (level == RIL_PARA) return at_para_start;
524 
525  ASSERT_HOST(false); // shouldn't happen.
526  return false;
527 }
BLOCK_RES * block() const
Definition: pageres.h:757
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
WERD_RES * word() const
Definition: pageres.h:751
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 534 of file resultiterator.cpp.

535  {
536  if (Empty(element)) return true; // Already at the end!
537  // The result is true if we step forward by element and find we are
538  // at the end of the page or at beginning of *all* levels in:
539  // [level, element).
540  // When there is more than one level difference between element and level,
541  // we could for instance move forward one symbol and still be at the first
542  // word on a line, so we also have to be at the first symbol in a word.
543  ResultIterator next(*this);
544  next.Next(element);
545  if (next.Empty(element)) return true; // Reached the end of the page.
546  while (element > level) {
547  element = static_cast<PageIteratorLevel>(element - 1);
548  if (!next.IsAtBeginningOf(element))
549  return false;
550  }
551  return true;
552 }
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
bool Empty(PageIteratorLevel level) const

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 423 of file resultiterator.cpp.

423  {
424  if (it_->block() == nullptr) return false; // already at end!
425  switch (level) {
426  case RIL_BLOCK: // explicit fall-through
427  case RIL_PARA: // explicit fall-through
428  case RIL_TEXTLINE:
429  if (!PageIterator::Next(level)) return false;
431  // if we've advanced to a new paragraph,
432  // recalculate current_paragraph_is_ltr_
433  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
434  }
435  in_minor_direction_ = false;
436  MoveToLogicalStartOfTextline();
437  return it_->block() != nullptr;
438  case RIL_SYMBOL:
439  {
440  GenericVector<int> blob_order;
441  CalculateBlobOrder(&blob_order);
442  int next_blob = 0;
443  while (next_blob < blob_order.size() &&
444  blob_index_ != blob_order[next_blob])
445  next_blob++;
446  next_blob++;
447  if (next_blob < blob_order.size()) {
448  // we're in the same word; simply advance one blob.
449  BeginWord(blob_order[next_blob]);
450  at_beginning_of_minor_run_ = false;
451  return true;
452  }
453  level = RIL_WORD; // we've fallen through to the next word.
454  }
455  case RIL_WORD: // explicit fall-through.
456  {
457  if (it_->word() == nullptr) return Next(RIL_BLOCK);
458  GenericVectorEqEq<int> word_indices;
459  int this_word_index = LTRWordIndex();
460  CalculateTextlineOrder(current_paragraph_is_ltr_,
461  *this,
462  &word_indices);
463  int final_real_index = word_indices.size() - 1;
464  while (final_real_index > 0 && word_indices[final_real_index] < 0)
465  final_real_index--;
466  for (int i = 0; i < final_real_index; i++) {
467  if (word_indices[i] == this_word_index) {
468  int j = i + 1;
469  for (; j < final_real_index && word_indices[j] < 0; j++) {
470  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
471  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
472  }
473  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
474  // awesome, we move to word_indices[j]
475  if (BidiDebug(3)) {
476  tprintf("Next(RIL_WORD): %d -> %d\n",
477  this_word_index, word_indices[j]);
478  }
480  for (int k = 0; k < word_indices[j]; k++) {
482  }
483  MoveToLogicalStartOfWord();
484  return true;
485  }
486  }
487  if (BidiDebug(3)) {
488  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
489  }
490  // we're going off the end of the text line.
491  return Next(RIL_TEXTLINE);
492  }
493  }
494  ASSERT_HOST(false); // shouldn't happen.
495  return false;
496 }
BLOCK_RES * block() const
Definition: pageres.h:757
bool IsWithinFirstTextlineOfParagraph() const
int size() const
Definition: genericvector.h:71
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
TESS_LOCAL void BeginWord(int offset)
WERD_RES * word() const
Definition: pageres.h:751
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
virtual void RestartRow()
static const int kMinorRunStart
virtual bool Next(PageIteratorLevel level)
virtual bool Next(PageIteratorLevel level)
#define ASSERT_HOST(x)
Definition: errcode.h:84
static const int kMinorRunEnd

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 55 of file resultiterator.cpp.

55  {
56  return current_paragraph_is_ltr_;
57 }

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator &  resit)
static

Definition at line 50 of file resultiterator.cpp.

51  {
52  return new ResultIterator(resit);
53 }
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)

Member Data Documentation

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3
static

Definition at line 142 of file resultiterator.h.

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

Definition at line 141 of file resultiterator.h.

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1
static

Definition at line 140 of file resultiterator.h.


The documentation for this class was generated from the following files: