tesseract  4.0.0-1-g2a2b
resultiterator.cpp
Go to the documentation of this file.
1 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 // Created: Fri May 27 13:58:06 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "resultiterator.h"
23 
24 #include "allheaders.h"
25 #include "pageres.h"
26 #include "strngs.h"
27 #include "tesseractclass.h"
28 #include "unicharset.h"
29 #include "unicodes.h"
30 #include <set>
31 #include <vector>
32 
33 namespace tesseract {
34 
36  : LTRResultIterator(resit) {
37  in_minor_direction_ = false;
38  at_beginning_of_minor_run_ = false;
39  preserve_interword_spaces_ = false;
40 
41  BoolParam *p = ParamUtils::FindParam<BoolParam>(
42  "preserve_interword_spaces", GlobalParams()->bool_params,
44  if (p != nullptr) preserve_interword_spaces_ = (bool)(*p);
45 
46  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
47  MoveToLogicalStartOfTextline();
48 }
49 
51  const LTRResultIterator &resit) {
52  return new ResultIterator(resit);
53 }
54 
56  return current_paragraph_is_ltr_;
57 }
58 
59 bool ResultIterator::CurrentParagraphIsLtr() const {
60  if (!it_->word())
61  return true; // doesn't matter.
62  LTRResultIterator it(*this);
63  it.RestartParagraph();
64  // Try to figure out the ltr-ness of the paragraph. The rules below
65  // make more sense in the context of a difficult paragraph example.
66  // Here we denote {ltr characters, RTL CHARACTERS}:
67  //
68  // "don't go in there!" DAIS EH
69  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
70  // .GNIDLIUB GNINRUB
71  //
72  // On the first line, the left-most word is LTR and the rightmost word
73  // is RTL. Thus, we are better off taking the majority direction for
74  // the whole paragraph contents. So instead of "the leftmost word is LTR"
75  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
76  // would not do: Typically an RTL paragraph would *not* start with an LTR
77  // word. So our heuristics are as follows:
78  //
79  // (1) If the first text line has an RTL word in the left-most position
80  // it is RTL.
81  // (2) If the first text line has an LTR word in the right-most position
82  // it is LTR.
83  // (3) If neither of the above is true, take the majority count for the
84  // paragraph -- if there are more rtl words, it is RTL. If there
85  // are more LTR words, it's LTR.
86  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
87  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
88  int num_ltr, num_rtl;
89  num_rtl = leftmost_rtl ? 1 : 0;
90  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
91  for (it.Next(RIL_WORD);
92  !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
93  it.Next(RIL_WORD)) {
94  StrongScriptDirection dir = it.WordDirection();
95  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
96  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
97  num_ltr += rightmost_ltr ? 1 : 0;
98  }
99  if (leftmost_rtl)
100  return false;
101  if (rightmost_ltr)
102  return true;
103  // First line is ambiguous. Take statistics on the whole paragraph.
104  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
105  StrongScriptDirection dir = it.WordDirection();
106  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
107  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
108  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
109  return num_ltr >= num_rtl;
110 }
111 
112 const int ResultIterator::kMinorRunStart = -1;
113 const int ResultIterator::kMinorRunEnd = -2;
114 const int ResultIterator::kComplexWord = -3;
115 
116 void ResultIterator::CalculateBlobOrder(
117  GenericVector<int> *blob_indices) const {
118  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
119  blob_indices->clear();
120  if (Empty(RIL_WORD)) return;
121  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
122  // Easy! just return the blobs in order;
123  for (int i = 0; i < word_length_; i++)
124  blob_indices->push_back(i);
125  return;
126  }
127 
128  // The blobs are in left-to-right order, but the current reading context
129  // is right-to-left.
130  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
131  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
132  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
133  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
134  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
135  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
136  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
137 
138  // Step 1: Scan for and mark European Number sequences
139  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
140  GenericVector<int> letter_types;
141  for (int i = 0; i < word_length_; i++) {
142  letter_types.push_back(it_->word()->SymbolDirection(i));
143  }
144  // Convert a single separtor sandwiched between two EN's into an EN.
145  for (int i = 0; i + 2 < word_length_; i++) {
146  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
147  (letter_types[i + 1] == U_EURO_NUM_SEP ||
148  letter_types[i + 1] == U_COMMON_NUM_SEP)) {
149  letter_types[i + 1] = U_EURO_NUM;
150  }
151  }
152  // Scan for sequences of European Number Terminators around ENs and convert
153  // them to ENs.
154  for (int i = 0; i < word_length_; i++) {
155  if (letter_types[i] == U_EURO_NUM_TERM) {
156  int j = i + 1;
157  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
158  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
159  // The sequence [i..j] should be converted to all European Numbers.
160  for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
161  }
162  j = i - 1;
163  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
164  if (j > -1 && letter_types[j] == U_EURO_NUM) {
165  // The sequence [j..i] should be converted to all European Numbers.
166  for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
167  }
168  }
169  }
170  // Step 2: Convert all remaining types to either L or R.
171  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
172  // All other are R.
173  for (int i = 0; i < word_length_;) {
174  int ti = letter_types[i];
175  if (ti == U_LTR || ti == U_EURO_NUM) {
176  // Left to right sequence; scan to the end of it.
177  int last_good = i;
178  for (int j = i + 1; j < word_length_; j++) {
179  int tj = letter_types[j];
180  if (tj == U_LTR || tj == U_EURO_NUM) {
181  last_good = j;
182  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
183  // do nothing.
184  } else {
185  break;
186  }
187  }
188  // [i..last_good] is the L sequence
189  for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
190  i = last_good + 1;
191  } else {
192  letter_types[i] = U_RTL;
193  i++;
194  }
195  }
196 
197  // At this point, letter_types is entirely U_LTR or U_RTL.
198  for (int i = word_length_ - 1; i >= 0;) {
199  if (letter_types[i] == U_RTL) {
200  blob_indices->push_back(i);
201  i--;
202  } else {
203  // left to right sequence. scan to the beginning.
204  int j = i - 1;
205  for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
206  // Now (j, i] is LTR
207  for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
208  i = j;
209  }
210  }
211  ASSERT_HOST(blob_indices->size() == word_length_);
212 }
213 
214 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
215  for (int i = 0; i < dirs.size(); i++) {
216  switch (dirs[i]) {
217  case DIR_NEUTRAL: tprintf ("N "); break;
218  case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
219  case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
220  case DIR_MIX: tprintf("Z "); break;
221  default: tprintf("? "); break;
222  }
223  }
224  tprintf("\n");
225 }
226 
228  bool paragraph_is_ltr,
229  const LTRResultIterator &resit,
230  GenericVectorEqEq<int> *word_indices) const {
232  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
233 }
234 
236  bool paragraph_is_ltr,
237  const LTRResultIterator &resit,
239  GenericVectorEqEq<int> *word_indices) const {
242  directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
243  directions->truncate(0);
244 
245  // A LTRResultIterator goes strictly left-to-right word order.
246  LTRResultIterator ltr_it(resit);
247  ltr_it.RestartRow();
248  if (ltr_it.Empty(RIL_WORD)) return;
249  do {
250  directions->push_back(ltr_it.WordDirection());
251  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
252 
253  word_indices->truncate(0);
254  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
255 }
256 
258  bool paragraph_is_ltr,
259  const GenericVector<StrongScriptDirection> &word_dirs,
260  GenericVectorEqEq<int> *reading_order) {
261  reading_order->truncate(0);
262  if (word_dirs.size() == 0) return;
263 
264  // Take all of the runs of minor direction words and insert them
265  // in reverse order.
266  int minor_direction, major_direction, major_step, start, end;
267  if (paragraph_is_ltr) {
268  start = 0;
269  end = word_dirs.size();
270  major_step = 1;
271  major_direction = DIR_LEFT_TO_RIGHT;
272  minor_direction = DIR_RIGHT_TO_LEFT;
273  } else {
274  start = word_dirs.size() - 1;
275  end = -1;
276  major_step = -1;
277  major_direction = DIR_RIGHT_TO_LEFT;
278  minor_direction = DIR_LEFT_TO_RIGHT;
279  // Special rule: if there are neutral words at the right most side
280  // of a line adjacent to a left-to-right word in the middle of the
281  // line, we interpret the end of the line as a single LTR sequence.
282  if (word_dirs[start] == DIR_NEUTRAL) {
283  int neutral_end = start;
284  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
285  neutral_end--;
286  }
287  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
288  // LTR followed by neutrals.
289  // Scan for the beginning of the minor left-to-right run.
290  int left = neutral_end;
291  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
292  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
293  }
294  reading_order->push_back(kMinorRunStart);
295  for (int i = left; i < word_dirs.size(); i++) {
296  reading_order->push_back(i);
297  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
298  }
299  reading_order->push_back(kMinorRunEnd);
300  start = left - 1;
301  }
302  }
303  }
304  for (int i = start; i != end;) {
305  if (word_dirs[i] == minor_direction) {
306  int j = i;
307  while (j != end && word_dirs[j] != major_direction)
308  j += major_step;
309  if (j == end) j -= major_step;
310  while (j != i && word_dirs[j] != minor_direction)
311  j -= major_step;
312  // [j..i] is a minor direction run.
313  reading_order->push_back(kMinorRunStart);
314  for (int k = j; k != i; k -= major_step) {
315  reading_order->push_back(k);
316  }
317  reading_order->push_back(i);
318  reading_order->push_back(kMinorRunEnd);
319  i = j + major_step;
320  } else {
321  reading_order->push_back(i);
322  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
323  i += major_step;
324  }
325  }
326 }
327 
328 int ResultIterator::LTRWordIndex() const {
329  int this_word_index = 0;
330  LTRResultIterator textline(*this);
331  textline.RestartRow();
332  while (!textline.PositionedAtSameWord(it_)) {
333  this_word_index++;
334  textline.Next(RIL_WORD);
335  }
336  return this_word_index;
337 }
338 
339 void ResultIterator::MoveToLogicalStartOfWord() {
340  if (word_length_ == 0) {
341  BeginWord(0);
342  return;
343  }
344  GenericVector<int> blob_order;
345  CalculateBlobOrder(&blob_order);
346  if (blob_order.size() == 0 || blob_order[0] == 0) return;
347  BeginWord(blob_order[0]);
348 }
349 
350 bool ResultIterator::IsAtFinalSymbolOfWord() const {
351  if (!it_->word()) return true;
352  GenericVector<int> blob_order;
353  CalculateBlobOrder(&blob_order);
354  return blob_order.size() == 0 || blob_order.back() == blob_index_;
355 }
356 
357 bool ResultIterator::IsAtFirstSymbolOfWord() const {
358  if (!it_->word()) return true;
359  GenericVector<int> blob_order;
360  CalculateBlobOrder(&blob_order);
361  return blob_order.size() == 0 || blob_order[0] == blob_index_;
362 }
363 
364 void ResultIterator::AppendSuffixMarks(STRING *text) const {
365  if (!it_->word()) return;
366  bool reading_direction_is_ltr =
367  current_paragraph_is_ltr_ ^ in_minor_direction_;
368  // scan forward to see what meta-information the word ordering algorithm
369  // left us.
370  // If this word is at the *end* of a minor run, insert the other
371  // direction's mark; else if this was a complex word, insert the
372  // current reading order's mark.
373  GenericVectorEqEq<int> textline_order;
374  CalculateTextlineOrder(current_paragraph_is_ltr_,
375  *this, &textline_order);
376  int this_word_index = LTRWordIndex();
377  int i = textline_order.get_index(this_word_index);
378  if (i < 0) return;
379 
380  int last_non_word_mark = 0;
381  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
382  last_non_word_mark = textline_order[i];
383  }
384  if (last_non_word_mark == kComplexWord) {
385  *text += reading_direction_is_ltr ? kLRM : kRLM;
386  } else if (last_non_word_mark == kMinorRunEnd) {
387  if (current_paragraph_is_ltr_) {
388  *text += kLRM;
389  } else {
390  *text += kRLM;
391  }
392  }
393 }
394 
395 void ResultIterator::MoveToLogicalStartOfTextline() {
396  GenericVectorEqEq<int> word_indices;
397  RestartRow();
398  CalculateTextlineOrder(current_paragraph_is_ltr_,
399  dynamic_cast<const LTRResultIterator&>(*this),
400  &word_indices);
401  int i = 0;
402  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
403  if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
404  else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
405  }
406  if (in_minor_direction_) at_beginning_of_minor_run_ = true;
407  if (i >= word_indices.size()) return;
408  int first_word_index = word_indices[i];
409  for (int j = 0; j < first_word_index; j++) {
411  }
412  MoveToLogicalStartOfWord();
413 }
414 
417  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
418  in_minor_direction_ = false;
419  at_beginning_of_minor_run_ = false;
420  MoveToLogicalStartOfTextline();
421 }
422 
424  if (it_->block() == nullptr) return false; // already at end!
425  switch (level) {
426  case RIL_BLOCK: // explicit fall-through
427  case RIL_PARA: // explicit fall-through
428  case RIL_TEXTLINE:
429  if (!PageIterator::Next(level)) return false;
431  // if we've advanced to a new paragraph,
432  // recalculate current_paragraph_is_ltr_
433  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
434  }
435  in_minor_direction_ = false;
436  MoveToLogicalStartOfTextline();
437  return it_->block() != nullptr;
438  case RIL_SYMBOL:
439  {
440  GenericVector<int> blob_order;
441  CalculateBlobOrder(&blob_order);
442  int next_blob = 0;
443  while (next_blob < blob_order.size() &&
444  blob_index_ != blob_order[next_blob])
445  next_blob++;
446  next_blob++;
447  if (next_blob < blob_order.size()) {
448  // we're in the same word; simply advance one blob.
449  BeginWord(blob_order[next_blob]);
450  at_beginning_of_minor_run_ = false;
451  return true;
452  }
453  level = RIL_WORD; // we've fallen through to the next word.
454  }
455  case RIL_WORD: // explicit fall-through.
456  {
457  if (it_->word() == nullptr) return Next(RIL_BLOCK);
458  GenericVectorEqEq<int> word_indices;
459  int this_word_index = LTRWordIndex();
460  CalculateTextlineOrder(current_paragraph_is_ltr_,
461  *this,
462  &word_indices);
463  int final_real_index = word_indices.size() - 1;
464  while (final_real_index > 0 && word_indices[final_real_index] < 0)
465  final_real_index--;
466  for (int i = 0; i < final_real_index; i++) {
467  if (word_indices[i] == this_word_index) {
468  int j = i + 1;
469  for (; j < final_real_index && word_indices[j] < 0; j++) {
470  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
471  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
472  }
473  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
474  // awesome, we move to word_indices[j]
475  if (BidiDebug(3)) {
476  tprintf("Next(RIL_WORD): %d -> %d\n",
477  this_word_index, word_indices[j]);
478  }
480  for (int k = 0; k < word_indices[j]; k++) {
482  }
483  MoveToLogicalStartOfWord();
484  return true;
485  }
486  }
487  if (BidiDebug(3)) {
488  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
489  }
490  // we're going off the end of the text line.
491  return Next(RIL_TEXTLINE);
492  }
493  }
494  ASSERT_HOST(false); // shouldn't happen.
495  return false;
496 }
497 
499  if (it_->block() == nullptr) return false; // Already at the end!
500  if (it_->word() == nullptr) return true; // In an image block.
501  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
502 
503  bool at_word_start = IsAtFirstSymbolOfWord();
504  if (level == RIL_WORD) return at_word_start;
505 
506  ResultIterator line_start(*this);
507  // move to the first word in the line...
508  line_start.MoveToLogicalStartOfTextline();
509 
510  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
511  if (level == RIL_TEXTLINE) return at_textline_start;
512 
513  // now we move to the left-most word...
514  line_start.RestartRow();
515  bool at_block_start = at_textline_start &&
516  line_start.it_->block() != line_start.it_->prev_block();
517  if (level == RIL_BLOCK) return at_block_start;
518 
519  bool at_para_start = at_block_start ||
520  (at_textline_start &&
521  line_start.it_->row()->row->para() !=
522  line_start.it_->prev_row()->row->para());
523  if (level == RIL_PARA) return at_para_start;
524 
525  ASSERT_HOST(false); // shouldn't happen.
526  return false;
527 }
528 
535  PageIteratorLevel element) const {
536  if (Empty(element)) return true; // Already at the end!
537  // The result is true if we step forward by element and find we are
538  // at the the end of the page or at beginning of *all* levels in:
539  // [level, element).
540  // When there is more than one level difference between element and level,
541  // we could for instance move forward one symbol and still be at the first
542  // word on a line, so we also have to be at the first symbol in a word.
543  ResultIterator next(*this);
544  next.Next(element);
545  if (next.Empty(element)) return true; // Reached the end of the page.
546  while (element > level) {
547  element = static_cast<PageIteratorLevel>(element - 1);
548  if (!next.IsAtBeginningOf(element))
549  return false;
550  }
551  return true;
552 }
553 
554 // Returns the number of blanks before the current word.
556  if (CurrentParagraphIsLtr()) return LTRResultIterator::BlanksBeforeWord();
557  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
558 }
559 
565  if (it_->word() == nullptr) return nullptr; // Already at the end!
566  STRING text;
567  switch (level) {
568  case RIL_BLOCK:
569  {
570  ResultIterator pp(*this);
571  do {
572  pp.AppendUTF8ParagraphText(&text);
573  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
574  }
575  break;
576  case RIL_PARA:
577  AppendUTF8ParagraphText(&text);
578  break;
579  case RIL_TEXTLINE:
580  {
581  ResultIterator it(*this);
582  it.MoveToLogicalStartOfTextline();
583  it.IterateAndAppendUTF8TextlineText(&text);
584  }
585  break;
586  case RIL_WORD:
587  AppendUTF8WordText(&text);
588  break;
589  case RIL_SYMBOL:
590  {
591  bool reading_direction_is_ltr =
592  current_paragraph_is_ltr_ ^ in_minor_direction_;
593  if (at_beginning_of_minor_run_) {
594  text += reading_direction_is_ltr ? kLRM : kRLM;
595  }
596  text = it_->word()->BestUTF8(blob_index_, false);
597  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
598  }
599  break;
600  }
601  int length = text.length() + 1;
602  char* result = new char[length];
603  strncpy(result, text.string(), length);
604  return result;
605 }
606 
607 std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetBestLSTMSymbolChoices() const {
608  if (it_->word() != nullptr) {
609  return &it_->word()->timesteps;
610  } else {
611  return nullptr;
612  }
613 }
614 
615 void ResultIterator::AppendUTF8WordText(STRING *text) const {
616  if (!it_->word()) return;
617  ASSERT_HOST(it_->word()->best_choice != nullptr);
618  bool reading_direction_is_ltr =
619  current_paragraph_is_ltr_ ^ in_minor_direction_;
620  if (at_beginning_of_minor_run_) {
621  *text += reading_direction_is_ltr ? kLRM : kRLM;
622  }
623 
624  GenericVector<int> blob_order;
625  CalculateBlobOrder(&blob_order);
626  for (int i = 0; i < blob_order.size(); i++) {
627  *text += it_->word()->BestUTF8(blob_order[i], false);
628  }
629  AppendSuffixMarks(text);
630 }
631 
632 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
633  if (Empty(RIL_WORD)) {
634  Next(RIL_WORD);
635  return;
636  }
637  if (BidiDebug(1)) {
638  GenericVectorEqEq<int> textline_order;
640  CalculateTextlineOrder(current_paragraph_is_ltr_,
641  *this, &dirs, &textline_order);
642  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
643  current_paragraph_is_ltr_ ? "ltr" : "rtl");
644  PrintScriptDirs(dirs);
645  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
646  current_paragraph_is_ltr_ ? "ltr" : "rtl");
647  for (int i = 0; i < textline_order.size(); i++) {
648  tprintf("%d ", textline_order[i]);
649  }
650  tprintf("\n");
651  }
652 
653  int words_appended = 0;
654  do {
655  int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
656  : (words_appended > 0);
657  for (int i = 0; i < numSpaces; ++i) {
658  *text += " ";
659  }
660  AppendUTF8WordText(text);
661  words_appended++;
662  if (BidiDebug(2)) {
663  tprintf("Num spaces=%d, text=%s\n", numSpaces, text->string());
664  }
665  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
666  if (BidiDebug(1)) {
667  tprintf("%d words printed\n", words_appended);
668  }
669  *text += line_separator_;
670  // If we just finished a paragraph, add an extra newline.
671  if (IsAtBeginningOf(RIL_PARA)) {
672  *text += paragraph_separator_;
673  }
674 }
675 
676 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
677  ResultIterator it(*this);
678  it.RestartParagraph();
679  it.MoveToLogicalStartOfTextline();
680  if (it.Empty(RIL_WORD)) return;
681  do {
682  it.IterateAndAppendUTF8TextlineText(text);
683  } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
684 }
685 
686 bool ResultIterator::BidiDebug(int min_level) const {
687  int debug_level = 1;
688  IntParam *p = ParamUtils::FindParam<IntParam>(
689  "bidi_debug", GlobalParams()->int_params,
691  if (p != nullptr) debug_level = (int32_t)(*p);
692  return debug_level >= min_level;
693 }
694 
695 } // namespace tesseract.
BLOCK_RES * block() const
Definition: pageres.h:757
bool IsWithinFirstTextlineOfParagraph() const
int size() const
Definition: genericvector.h:71
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:383
const char * kRLM
Definition: unicodes.cpp:28
GenericVector< IntParam * > int_params
Definition: params.h:44
GenericVector< BoolParam * > bool_params
Definition: params.h:45
ROW_RES * row() const
Definition: pageres.h:754
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
const char * string() const
Definition: strngs.cpp:196
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
StrongScriptDirection
Definition: unichar.h:42
virtual char * GetUTF8Text(PageIteratorLevel level) const
TESS_LOCAL void BeginWord(int offset)
uint8_t space()
Definition: werd.h:102
T & back() const
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
int get_index(const T &object) const
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:32
ROW_RES * prev_row() const
Definition: pageres.h:745
static const int kComplexWord
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
WERD_RES * word() const
Definition: pageres.h:751
ParamsVectors * params()
Definition: ccutil.h:62
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
virtual void RestartRow()
BLOCK_RES * prev_block() const
Definition: pageres.h:748
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
int push_back(T object)
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:361
Definition: strngs.h:45
static const int kMinorRunStart
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
virtual bool Next(PageIteratorLevel level)
bool UnicharsInReadingOrder() const
Definition: pageres.h:425
const char * kLRM
Definition: unicodes.cpp:27
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:224
void truncate(int size)
bool Empty(PageIteratorLevel level) const
virtual bool Next(PageIteratorLevel level)
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
ROW * row
Definition: pageres.h:143
#define ASSERT_HOST(x)
Definition: errcode.h:84
static const int kMinorRunEnd
PARA * para() const
Definition: ocrrow.h:118
WERD * word
Definition: pageres.h:189