tesseract  5.0.0-alpha-619-ge9db
resultiterator.cpp
Go to the documentation of this file.
1 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 //
8 // (C) Copyright 2011, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
22 
23 #include <set>
24 #include <vector>
25 #include "allheaders.h"
26 #include "pageres.h"
27 #include <tesseract/strngs.h>
28 #include "tesseractclass.h"
29 #include "unicharset.h"
30 #include "unicodes.h"
31 
32 namespace tesseract {
33 
35  : LTRResultIterator(resit) {
36  in_minor_direction_ = false;
37  at_beginning_of_minor_run_ = false;
38  preserve_interword_spaces_ = false;
39 
40  auto* p = ParamUtils::FindParam<BoolParam>("preserve_interword_spaces",
41  GlobalParams()->bool_params,
43  if (p != nullptr)
44  preserve_interword_spaces_ = (bool)(*p);
45 
46  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
47  MoveToLogicalStartOfTextline();
48 }
49 
51  const LTRResultIterator& resit) {
52  return new ResultIterator(resit);
53 }
54 
56  return current_paragraph_is_ltr_;
57 }
58 
59 bool ResultIterator::CurrentParagraphIsLtr() const {
60  if (!it_->word())
61  return true; // doesn't matter.
62  LTRResultIterator it(*this);
63  it.RestartParagraph();
64  // Try to figure out the ltr-ness of the paragraph. The rules below
65  // make more sense in the context of a difficult paragraph example.
66  // Here we denote {ltr characters, RTL CHARACTERS}:
67  //
68  // "don't go in there!" DAIS EH
69  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
70  // .GNIDLIUB GNINRUB
71  //
72  // On the first line, the left-most word is LTR and the rightmost word
73  // is RTL. Thus, we are better off taking the majority direction for
74  // the whole paragraph contents. So instead of "the leftmost word is LTR"
75  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
76  // would not do: Typically an RTL paragraph would *not* start with an LTR
77  // word. So our heuristics are as follows:
78  //
79  // (1) If the first text line has an RTL word in the left-most position
80  // it is RTL.
81  // (2) If the first text line has an LTR word in the right-most position
82  // it is LTR.
83  // (3) If neither of the above is true, take the majority count for the
84  // paragraph -- if there are more rtl words, it is RTL. If there
85  // are more LTR words, it's LTR.
86  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
87  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
88  int num_ltr, num_rtl;
89  num_rtl = leftmost_rtl ? 1 : 0;
90  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
91  for (it.Next(RIL_WORD);
92  !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
93  it.Next(RIL_WORD)) {
94  StrongScriptDirection dir = it.WordDirection();
95  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
96  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
97  num_ltr += rightmost_ltr ? 1 : 0;
98  }
99  if (leftmost_rtl)
100  return false;
101  if (rightmost_ltr)
102  return true;
103  // First line is ambiguous. Take statistics on the whole paragraph.
104  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA))
105  do {
106  StrongScriptDirection dir = it.WordDirection();
107  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
108  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
109  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
110  return num_ltr >= num_rtl;
111 }
112 
113 const int ResultIterator::kMinorRunStart = -1;
114 const int ResultIterator::kMinorRunEnd = -2;
115 const int ResultIterator::kComplexWord = -3;
116 
117 void ResultIterator::CalculateBlobOrder(
118  GenericVector<int>* blob_indices) const {
119  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
120  blob_indices->clear();
121  if (Empty(RIL_WORD))
122  return;
123  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
124  // Easy! just return the blobs in order;
125  for (int i = 0; i < word_length_; i++) blob_indices->push_back(i);
126  return;
127  }
128 
129  // The blobs are in left-to-right order, but the current reading context
130  // is right-to-left.
131  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
132  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
133  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
134  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
135  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
136  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
137  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
138 
139  // Step 1: Scan for and mark European Number sequences
140  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
141  GenericVector<int> letter_types;
142  for (int i = 0; i < word_length_; i++) {
143  letter_types.push_back(it_->word()->SymbolDirection(i));
144  }
145  // Convert a single separtor sandwiched between two EN's into an EN.
146  for (int i = 0; i + 2 < word_length_; i++) {
147  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
148  (letter_types[i + 1] == U_EURO_NUM_SEP ||
149  letter_types[i + 1] == U_COMMON_NUM_SEP)) {
150  letter_types[i + 1] = U_EURO_NUM;
151  }
152  }
153  // Scan for sequences of European Number Terminators around ENs and convert
154  // them to ENs.
155  for (int i = 0; i < word_length_; i++) {
156  if (letter_types[i] == U_EURO_NUM_TERM) {
157  int j = i + 1;
158  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
159  j++;
160  }
161  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
162  // The sequence [i..j] should be converted to all European Numbers.
163  for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
164  }
165  j = i - 1;
166  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
167  j--;
168  }
169  if (j > -1 && letter_types[j] == U_EURO_NUM) {
170  // The sequence [j..i] should be converted to all European Numbers.
171  for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
172  }
173  }
174  }
175  // Step 2: Convert all remaining types to either L or R.
176  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
177  // All other are R.
178  for (int i = 0; i < word_length_;) {
179  int ti = letter_types[i];
180  if (ti == U_LTR || ti == U_EURO_NUM) {
181  // Left to right sequence; scan to the end of it.
182  int last_good = i;
183  for (int j = i + 1; j < word_length_; j++) {
184  int tj = letter_types[j];
185  if (tj == U_LTR || tj == U_EURO_NUM) {
186  last_good = j;
187  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
188  // do nothing.
189  } else {
190  break;
191  }
192  }
193  // [i..last_good] is the L sequence
194  for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
195  i = last_good + 1;
196  } else {
197  letter_types[i] = U_RTL;
198  i++;
199  }
200  }
201 
202  // At this point, letter_types is entirely U_LTR or U_RTL.
203  for (int i = word_length_ - 1; i >= 0;) {
204  if (letter_types[i] == U_RTL) {
205  blob_indices->push_back(i);
206  i--;
207  } else {
208  // left to right sequence. scan to the beginning.
209  int j = i - 1;
210  for (; j >= 0 && letter_types[j] != U_RTL; j--) {
211  } // pass
212  // Now (j, i] is LTR
213  for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
214  i = j;
215  }
216  }
217  ASSERT_HOST(blob_indices->size() == word_length_);
218 }
219 
220 static void PrintScriptDirs(const GenericVector<StrongScriptDirection>& dirs) {
221  for (int i = 0; i < dirs.size(); i++) {
222  switch (dirs[i]) {
223  case DIR_NEUTRAL:
224  tprintf("N ");
225  break;
226  case DIR_LEFT_TO_RIGHT:
227  tprintf("L ");
228  break;
229  case DIR_RIGHT_TO_LEFT:
230  tprintf("R ");
231  break;
232  case DIR_MIX:
233  tprintf("Z ");
234  break;
235  default:
236  tprintf("? ");
237  break;
238  }
239  }
240  tprintf("\n");
241 }
242 
244  bool paragraph_is_ltr, const LTRResultIterator& resit,
245  GenericVectorEqEq<int>* word_indices) const {
247  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
248 }
249 
251  bool paragraph_is_ltr, const LTRResultIterator& resit,
253  GenericVectorEqEq<int>* word_indices) const {
256  directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
257  directions->truncate(0);
258 
259  // A LTRResultIterator goes strictly left-to-right word order.
260  LTRResultIterator ltr_it(resit);
261  ltr_it.RestartRow();
262  if (ltr_it.Empty(RIL_WORD))
263  return;
264  do {
265  directions->push_back(ltr_it.WordDirection());
266  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
267 
268  word_indices->truncate(0);
269  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
270 }
271 
273  bool paragraph_is_ltr,
274  const GenericVector<StrongScriptDirection>& word_dirs,
275  GenericVectorEqEq<int>* reading_order) {
276  reading_order->truncate(0);
277  if (word_dirs.size() == 0)
278  return;
279 
280  // Take all of the runs of minor direction words and insert them
281  // in reverse order.
282  int minor_direction, major_direction, major_step, start, end;
283  if (paragraph_is_ltr) {
284  start = 0;
285  end = word_dirs.size();
286  major_step = 1;
287  major_direction = DIR_LEFT_TO_RIGHT;
288  minor_direction = DIR_RIGHT_TO_LEFT;
289  } else {
290  start = word_dirs.size() - 1;
291  end = -1;
292  major_step = -1;
293  major_direction = DIR_RIGHT_TO_LEFT;
294  minor_direction = DIR_LEFT_TO_RIGHT;
295  // Special rule: if there are neutral words at the right most side
296  // of a line adjacent to a left-to-right word in the middle of the
297  // line, we interpret the end of the line as a single LTR sequence.
298  if (word_dirs[start] == DIR_NEUTRAL) {
299  int neutral_end = start;
300  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
301  neutral_end--;
302  }
303  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
304  // LTR followed by neutrals.
305  // Scan for the beginning of the minor left-to-right run.
306  int left = neutral_end;
307  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
308  if (word_dirs[i] == DIR_LEFT_TO_RIGHT)
309  left = i;
310  }
311  reading_order->push_back(kMinorRunStart);
312  for (int i = left; i < word_dirs.size(); i++) {
313  reading_order->push_back(i);
314  if (word_dirs[i] == DIR_MIX)
315  reading_order->push_back(kComplexWord);
316  }
317  reading_order->push_back(kMinorRunEnd);
318  start = left - 1;
319  }
320  }
321  }
322  for (int i = start; i != end;) {
323  if (word_dirs[i] == minor_direction) {
324  int j = i;
325  while (j != end && word_dirs[j] != major_direction) j += major_step;
326  if (j == end)
327  j -= major_step;
328  while (j != i && word_dirs[j] != minor_direction) j -= major_step;
329  // [j..i] is a minor direction run.
330  reading_order->push_back(kMinorRunStart);
331  for (int k = j; k != i; k -= major_step) {
332  reading_order->push_back(k);
333  }
334  reading_order->push_back(i);
335  reading_order->push_back(kMinorRunEnd);
336  i = j + major_step;
337  } else {
338  reading_order->push_back(i);
339  if (word_dirs[i] == DIR_MIX)
340  reading_order->push_back(kComplexWord);
341  i += major_step;
342  }
343  }
344 }
345 
346 int ResultIterator::LTRWordIndex() const {
347  int this_word_index = 0;
348  LTRResultIterator textline(*this);
349  textline.RestartRow();
350  while (!textline.PositionedAtSameWord(it_)) {
351  this_word_index++;
352  textline.Next(RIL_WORD);
353  }
354  return this_word_index;
355 }
356 
357 void ResultIterator::MoveToLogicalStartOfWord() {
358  if (word_length_ == 0) {
359  BeginWord(0);
360  return;
361  }
362  GenericVector<int> blob_order;
363  CalculateBlobOrder(&blob_order);
364  if (blob_order.size() == 0 || blob_order[0] == 0)
365  return;
366  BeginWord(blob_order[0]);
367 }
368 
369 bool ResultIterator::IsAtFinalSymbolOfWord() const {
370  if (!it_->word())
371  return true;
372  GenericVector<int> blob_order;
373  CalculateBlobOrder(&blob_order);
374  return blob_order.size() == 0 || blob_order.back() == blob_index_;
375 }
376 
377 bool ResultIterator::IsAtFirstSymbolOfWord() const {
378  if (!it_->word())
379  return true;
380  GenericVector<int> blob_order;
381  CalculateBlobOrder(&blob_order);
382  return blob_order.size() == 0 || blob_order[0] == blob_index_;
383 }
384 
385 void ResultIterator::AppendSuffixMarks(STRING* text) const {
386  if (!it_->word())
387  return;
388  bool reading_direction_is_ltr =
389  current_paragraph_is_ltr_ ^ in_minor_direction_;
390  // scan forward to see what meta-information the word ordering algorithm
391  // left us.
392  // If this word is at the *end* of a minor run, insert the other
393  // direction's mark; else if this was a complex word, insert the
394  // current reading order's mark.
395  GenericVectorEqEq<int> textline_order;
396  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
397  int this_word_index = LTRWordIndex();
398  int i = textline_order.get_index(this_word_index);
399  if (i < 0)
400  return;
401 
402  int last_non_word_mark = 0;
403  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
404  last_non_word_mark = textline_order[i];
405  }
406  if (last_non_word_mark == kComplexWord) {
407  *text += reading_direction_is_ltr ? kLRM : kRLM;
408  } else if (last_non_word_mark == kMinorRunEnd) {
409  if (current_paragraph_is_ltr_) {
410  *text += kLRM;
411  } else {
412  *text += kRLM;
413  }
414  }
415 }
416 
417 void ResultIterator::MoveToLogicalStartOfTextline() {
418  GenericVectorEqEq<int> word_indices;
419  RestartRow();
420  CalculateTextlineOrder(current_paragraph_is_ltr_,
421  dynamic_cast<const LTRResultIterator&>(*this),
422  &word_indices);
423  int i = 0;
424  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
425  if (word_indices[i] == kMinorRunStart)
426  in_minor_direction_ = true;
427  else if (word_indices[i] == kMinorRunEnd)
428  in_minor_direction_ = false;
429  }
430  if (in_minor_direction_)
431  at_beginning_of_minor_run_ = true;
432  if (i >= word_indices.size())
433  return;
434  int first_word_index = word_indices[i];
435  for (int j = 0; j < first_word_index; j++) {
437  }
438  MoveToLogicalStartOfWord();
439 }
440 
443  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
444  in_minor_direction_ = false;
445  at_beginning_of_minor_run_ = false;
446  MoveToLogicalStartOfTextline();
447 }
448 
450  if (it_->block() == nullptr)
451  return false; // already at end!
452  switch (level) {
453  case RIL_BLOCK: // explicit fall-through
454  case RIL_PARA: // explicit fall-through
455  case RIL_TEXTLINE:
456  if (!PageIterator::Next(level))
457  return false;
459  // if we've advanced to a new paragraph,
460  // recalculate current_paragraph_is_ltr_
461  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
462  }
463  in_minor_direction_ = false;
464  MoveToLogicalStartOfTextline();
465  return it_->block() != nullptr;
466  case RIL_SYMBOL: {
467  GenericVector<int> blob_order;
468  CalculateBlobOrder(&blob_order);
469  int next_blob = 0;
470  while (next_blob < blob_order.size() &&
471  blob_index_ != blob_order[next_blob])
472  next_blob++;
473  next_blob++;
474  if (next_blob < blob_order.size()) {
475  // we're in the same word; simply advance one blob.
476  BeginWord(blob_order[next_blob]);
477  at_beginning_of_minor_run_ = false;
478  return true;
479  }
480  level = RIL_WORD; // we've fallen through to the next word.
481  }
482  // Fall through.
483  case RIL_WORD: // explicit fall-through.
484  {
485  if (it_->word() == nullptr)
486  return Next(RIL_BLOCK);
487  GenericVectorEqEq<int> word_indices;
488  int this_word_index = LTRWordIndex();
489  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
490  int final_real_index = word_indices.size() - 1;
491  while (final_real_index > 0 && word_indices[final_real_index] < 0)
492  final_real_index--;
493  for (int i = 0; i < final_real_index; i++) {
494  if (word_indices[i] == this_word_index) {
495  int j = i + 1;
496  for (; j < final_real_index && word_indices[j] < 0; j++) {
497  if (word_indices[j] == kMinorRunStart)
498  in_minor_direction_ = true;
499  if (word_indices[j] == kMinorRunEnd)
500  in_minor_direction_ = false;
501  }
502  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
503  // awesome, we move to word_indices[j]
504  if (BidiDebug(3)) {
505  tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index,
506  word_indices[j]);
507  }
509  for (int k = 0; k < word_indices[j]; k++) {
511  }
512  MoveToLogicalStartOfWord();
513  return true;
514  }
515  }
516  if (BidiDebug(3)) {
517  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
518  }
519  // we're going off the end of the text line.
520  return Next(RIL_TEXTLINE);
521  }
522  }
523  ASSERT_HOST(false); // shouldn't happen.
524  return false;
525 }
526 
528  if (it_->block() == nullptr)
529  return false; // Already at the end!
530  if (it_->word() == nullptr)
531  return true; // In an image block.
532  if (level == RIL_SYMBOL)
533  return true; // Always at beginning of a symbol.
534 
535  bool at_word_start = IsAtFirstSymbolOfWord();
536  if (level == RIL_WORD)
537  return at_word_start;
538 
539  ResultIterator line_start(*this);
540  // move to the first word in the line...
541  line_start.MoveToLogicalStartOfTextline();
542 
543  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
544  if (level == RIL_TEXTLINE)
545  return at_textline_start;
546 
547  // now we move to the left-most word...
548  line_start.RestartRow();
549  bool at_block_start = at_textline_start &&
550  line_start.it_->block() != line_start.it_->prev_block();
551  if (level == RIL_BLOCK)
552  return at_block_start;
553 
554  bool at_para_start =
555  at_block_start ||
556  (at_textline_start && line_start.it_->row()->row->para() !=
557  line_start.it_->prev_row()->row->para());
558  if (level == RIL_PARA)
559  return at_para_start;
560 
561  ASSERT_HOST(false); // shouldn't happen.
562  return false;
563 }
564 
571  PageIteratorLevel element) const {
572  if (Empty(element))
573  return true; // Already at the end!
574  // The result is true if we step forward by element and find we are
575  // at the end of the page or at beginning of *all* levels in:
576  // [level, element).
577  // When there is more than one level difference between element and level,
578  // we could for instance move forward one symbol and still be at the first
579  // word on a line, so we also have to be at the first symbol in a word.
580  ResultIterator next(*this);
581  next.Next(element);
582  if (next.Empty(element))
583  return true; // Reached the end of the page.
584  while (element > level) {
585  element = static_cast<PageIteratorLevel>(element - 1);
586  if (!next.IsAtBeginningOf(element))
587  return false;
588  }
589  return true;
590 }
591 
592 // Returns the number of blanks before the current word.
594  if (CurrentParagraphIsLtr())
596  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
597 }
598 
604  if (it_->word() == nullptr)
605  return nullptr; // Already at the end!
606  STRING text;
607  switch (level) {
608  case RIL_BLOCK: {
609  ResultIterator pp(*this);
610  do {
611  pp.AppendUTF8ParagraphText(&text);
612  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
613  } break;
614  case RIL_PARA:
615  AppendUTF8ParagraphText(&text);
616  break;
617  case RIL_TEXTLINE: {
618  ResultIterator it(*this);
619  it.MoveToLogicalStartOfTextline();
620  it.IterateAndAppendUTF8TextlineText(&text);
621  } break;
622  case RIL_WORD:
623  AppendUTF8WordText(&text);
624  break;
625  case RIL_SYMBOL: {
626  bool reading_direction_is_ltr =
627  current_paragraph_is_ltr_ ^ in_minor_direction_;
628  if (at_beginning_of_minor_run_) {
629  text += reading_direction_is_ltr ? kLRM : kRLM;
630  }
631  text = it_->word()->BestUTF8(blob_index_, false);
632  if (IsAtFinalSymbolOfWord())
633  AppendSuffixMarks(&text);
634  } break;
635  }
636  int length = text.length() + 1;
637  char* result = new char[length];
638  strncpy(result, text.c_str(), length);
639  return result;
640 }
641 std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
643  if (it_->word() != nullptr) {
644  return &it_->word()->segmented_timesteps;
645  } else {
646  return nullptr;
647  }
648 }
649 
650 std::vector<std::vector<std::pair<const char*, float>>>*
652  if (it_->word() != nullptr) {
653  return &it_->word()->CTC_symbol_choices;
654  } else {
655  return nullptr;
656  }
657 }
658 
659 void ResultIterator::AppendUTF8WordText(STRING* text) const {
660  if (!it_->word())
661  return;
662  ASSERT_HOST(it_->word()->best_choice != nullptr);
663  bool reading_direction_is_ltr =
664  current_paragraph_is_ltr_ ^ in_minor_direction_;
665  if (at_beginning_of_minor_run_) {
666  *text += reading_direction_is_ltr ? kLRM : kRLM;
667  }
668 
669  GenericVector<int> blob_order;
670  CalculateBlobOrder(&blob_order);
671  for (int i = 0; i < blob_order.size(); i++) {
672  *text += it_->word()->BestUTF8(blob_order[i], false);
673  }
674  AppendSuffixMarks(text);
675 }
676 
677 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING* text) {
678  if (Empty(RIL_WORD)) {
679  Next(RIL_WORD);
680  return;
681  }
682  if (BidiDebug(1)) {
683  GenericVectorEqEq<int> textline_order;
685  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs,
686  &textline_order);
687  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
688  current_paragraph_is_ltr_ ? "ltr" : "rtl");
689  PrintScriptDirs(dirs);
690  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
691  current_paragraph_is_ltr_ ? "ltr" : "rtl");
692  for (int i = 0; i < textline_order.size(); i++) {
693  tprintf("%d ", textline_order[i]);
694  }
695  tprintf("\n");
696  }
697 
698  int words_appended = 0;
699  do {
700  int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
701  : (words_appended > 0);
702  for (int i = 0; i < numSpaces; ++i) {
703  *text += " ";
704  }
705  AppendUTF8WordText(text);
706  words_appended++;
707  if (BidiDebug(2)) {
708  tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
709  }
710  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
711  if (BidiDebug(1)) {
712  tprintf("%d words printed\n", words_appended);
713  }
714  *text += line_separator_;
715  // If we just finished a paragraph, add an extra newline.
716  if (IsAtBeginningOf(RIL_PARA)) {
717  *text += paragraph_separator_;
718  }
719 }
720 
721 void ResultIterator::AppendUTF8ParagraphText(STRING* text) const {
722  ResultIterator it(*this);
723  it.RestartParagraph();
724  it.MoveToLogicalStartOfTextline();
725  if (it.Empty(RIL_WORD))
726  return;
727  do {
728  it.IterateAndAppendUTF8TextlineText(text);
729  } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
730 }
731 
732 bool ResultIterator::BidiDebug(int min_level) const {
733  int debug_level = 1;
734  auto* p =
735  ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
737  if (p != nullptr)
738  debug_level = (int32_t)(*p);
739  return debug_level >= min_level;
740 }
741 
742 } // namespace tesseract.
tesseract::ResultIterator::ResultIterator
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
Definition: resultiterator.cpp:34
strngs.h
tesseract::kRLM
const char *const kRLM
Right-to-Left Mark.
Definition: unicodes.cpp:39
tesseract::ResultIterator::GetRawLSTMTimesteps
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps() const
Definition: resultiterator.cpp:642
tesseract::ResultIterator::IsAtFinalElement
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
Definition: resultiterator.cpp:570
ROW::para
PARA * para() const
Definition: ocrrow.h:117
UNICHARSET::U_EUROPEAN_NUMBER
Definition: unicharset.h:159
tesseract::RIL_WORD
Definition: publictypes.h:220
pageres.h
tesseract::PageIterator::RestartRow
virtual void RestartRow()
Definition: pageiterator.cpp:129
tesseract::ResultIterator::Begin
void Begin() override
Definition: resultiterator.cpp:441
tesseract::ResultIterator::CalculateTextlineOrder
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
Definition: resultiterator.cpp:272
tesseract::PageIterator::it_
PAGE_RES_IT * it_
Definition: pageiterator.h:332
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::RIL_BLOCK
Definition: publictypes.h:217
WERD_RES::BestUTF8
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:357
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
tesseract::ResultIterator::kMinorRunStart
static const int kMinorRunStart
Definition: resultiterator.h:146
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
StrongScriptDirection
StrongScriptDirection
Definition: unichar.h:43
tesseract::ResultIterator::IsAtBeginningOf
bool IsAtBeginningOf(PageIteratorLevel level) const override
Definition: resultiterator.cpp:527
STRING
Definition: strngs.h:45
tesseract::ParamsVectors::int_params
GenericVector< IntParam * > int_params
Definition: params.h:57
tesseract::LTRResultIterator::line_separator_
const char * line_separator_
Definition: ltrresultiterator.h:181
UNICHARSET::U_OTHER_NEUTRAL
Definition: unicharset.h:167
UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:157
PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:742
tesseract::LTRResultIterator::LTRResultIterator
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
Definition: ltrresultiterator.cpp:29
tesseract::ResultIterator::BlanksBeforeWord
int BlanksBeforeWord() const
Definition: resultiterator.cpp:593
tesseract::LTRResultIterator
Definition: ltrresultiterator.h:47
tesseract::LTRResultIterator::paragraph_separator_
const char * paragraph_separator_
Definition: ltrresultiterator.h:182
tesseract::PageIterator::IsWithinFirstTextlineOfParagraph
bool IsWithinFirstTextlineOfParagraph() const
Definition: pageiterator.cpp:123
GenericVector::back
T & back() const
Definition: genericvector.h:728
resultiterator.h
DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
tesseract::RIL_SYMBOL
Definition: publictypes.h:221
UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
tesseract::ResultIterator::GetBestLSTMSymbolChoices
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
Definition: resultiterator.cpp:651
WERD_RES::segmented_timesteps
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:218
tesseract::PageIterator::word_length_
int word_length_
Definition: pageiterator.h:339
tesseract::kLRM
const char *const kLRM
Left-to-Right Mark.
Definition: unicodes.cpp:38
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD::space
uint8_t space()
Definition: werd.h:98
tesseract::PageIterator::Empty
bool Empty(PageIteratorLevel level) const
Definition: pageiterator.cpp:349
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::PageIterator::Next
virtual bool Next(PageIteratorLevel level)
Definition: pageiterator.cpp:147
unicharset.h
ROW_RES::row
ROW * row
Definition: pageres.h:136
UNICHARSET::U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
tesseract::PageIterator::tesseract_
Tesseract * tesseract_
Definition: pageiterator.h:327
tesseract::ResultIterator::kComplexWord
static const int kComplexWord
Definition: resultiterator.h:148
WERD_RES::SymbolDirection
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:379
tesseract::ResultIterator::StartOfParagraph
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
Definition: resultiterator.cpp:50
WERD_RES::CTC_symbol_choices
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:220
PAGE_RES_IT::prev_block
BLOCK_RES * prev_block() const
Definition: pageres.h:745
tesseract::ResultIterator::kMinorRunEnd
static const int kMinorRunEnd
Definition: resultiterator.h:147
tesseract::LTRResultIterator::BlanksBeforeWord
int BlanksBeforeWord() const
Definition: ltrresultiterator.cpp:244
GlobalParams
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:32
tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:216
GenericVector::get_index
int get_index(const T &object) const
Definition: genericvector.h:781
tesseract
Definition: baseapi.h:65
unicodes.h
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:51
GenericVectorEqEq< int >
tesseract::RIL_TEXTLINE
Definition: publictypes.h:219
tesseract::ResultIterator::Next
bool Next(PageIteratorLevel level) override
Definition: resultiterator.cpp:449
GenericVector< int >
DIR_NEUTRAL
Definition: unichar.h:44
tesseract::ResultIterator
Definition: resultiterator.h:44
UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
tesseract::PageIterator::Begin
virtual void Begin()
Definition: pageiterator.cpp:105
STRING::length
int32_t length() const
Definition: strngs.cpp:187
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::ResultIterator::GetUTF8Text
virtual char * GetUTF8Text(PageIteratorLevel level) const
Definition: resultiterator.cpp:603
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
DIR_MIX
Definition: unichar.h:47
WERD_RES::word
WERD * word
Definition: pageres.h:180
tesseract::PageIterator::BeginWord
TESS_LOCAL void BeginWord(int offset)
Definition: pageiterator.cpp:585
WERD_RES::UnicharsInReadingOrder
bool UnicharsInReadingOrder() const
Definition: pageres.h:421
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::RIL_PARA
Definition: publictypes.h:218
tesseract::ResultIterator::ParagraphIsLtr
bool ParagraphIsLtr() const
Definition: resultiterator.cpp:55
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
tesseract::ParamsVectors::bool_params
GenericVector< BoolParam * > bool_params
Definition: params.h:58
tesseract::PageIterator::blob_index_
int blob_index_
Definition: pageiterator.h:341