All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
resultiterator.cpp
Go to the documentation of this file.
1 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 // Created: Fri May 27 13:58:06 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "resultiterator.h"
23 
24 #include "allheaders.h"
25 #include "pageres.h"
26 #include "strngs.h"
27 #include "tesseractclass.h"
28 #include "unicharset.h"
29 #include "unicodes.h"
30 
31 namespace tesseract {
32 
34  : LTRResultIterator(resit) {
35  in_minor_direction_ = false;
36  at_beginning_of_minor_run_ = false;
37  preserve_interword_spaces_ = false;
38 
39  BoolParam *p = ParamUtils::FindParam<BoolParam>(
40  "preserve_interword_spaces", GlobalParams()->bool_params,
42  if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
43 
44  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
45  MoveToLogicalStartOfTextline();
46 }
47 
49  const LTRResultIterator &resit) {
50  return new ResultIterator(resit);
51 }
52 
// NOTE(review): the signature line of this definition (listing line 53) is
// not present in this extract, so its name cannot be confirmed from here.
// The visible body simply returns the cached current_paragraph_is_ltr_ flag.
54  return current_paragraph_is_ltr_;
55 }
56 
57 bool ResultIterator::CurrentParagraphIsLtr() const {
58  if (!it_->word())
59  return true; // doesn't matter.
60  LTRResultIterator it(*this);
61  it.RestartParagraph();
62  // Try to figure out the ltr-ness of the paragraph. The rules below
63  // make more sense in the context of a difficult paragraph example.
64  // Here we denote {ltr characters, RTL CHARACTERS}:
65  //
66  // "don't go in there!" DAIS EH
67  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
68  // .GNIDLIUB GNINRUB
69  //
70  // On the first line, the left-most word is LTR and the rightmost word
71  // is RTL. Thus, we are better off taking the majority direction for
72  // the whole paragraph contents. So instead of "the leftmost word is LTR"
73  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
74  // would not do: Typically an RTL paragraph would *not* start with an LTR
75  // word. So our heuristics are as follows:
76  //
77  // (1) If the first text line has an RTL word in the left-most position
78  // it is RTL.
79  // (2) If the first text line has an LTR word in the right-most position
80  // it is LTR.
81  // (3) If neither of the above is true, take the majority count for the
82  // paragraph -- if there are more rtl words, it is RTL. If there
83  // are more LTR words, it's LTR.
84  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
85  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
86  int num_ltr, num_rtl;
87  num_rtl = leftmost_rtl ? 1 : 0;
88  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
89  for (it.Next(RIL_WORD);
90  !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
91  it.Next(RIL_WORD)) {
92  StrongScriptDirection dir = it.WordDirection();
93  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
94  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
95  num_ltr += rightmost_ltr ? 1 : 0;
96  }
97  if (leftmost_rtl)
98  return false;
99  if (rightmost_ltr)
100  return true;
101  // First line is ambiguous. Take statistics on the whole paragraph.
102  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
103  StrongScriptDirection dir = it.WordDirection();
104  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
105  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
106  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
107  return num_ltr >= num_rtl;
108 }
109 
110 const int ResultIterator::kMinorRunStart = -1;
111 const int ResultIterator::kMinorRunEnd = -2;
112 const int ResultIterator::kComplexWord = -3;
113 
114 void ResultIterator::CalculateBlobOrder(
115  GenericVector<int> *blob_indices) const {
116  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
117  blob_indices->clear();
118  if (Empty(RIL_WORD)) return;
119  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
120  // Easy! just return the blobs in order;
121  for (int i = 0; i < word_length_; i++)
122  blob_indices->push_back(i);
123  return;
124  }
125 
126  // The blobs are in left-to-right order, but the current reading context
127  // is right-to-left.
128  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
129  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
130  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
131  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
132  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
133  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
134  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
135 
136  // Step 1: Scan for and mark European Number sequences
137  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
138  GenericVector<int> letter_types;
139  for (int i = 0; i < word_length_; i++) {
140  letter_types.push_back(it_->word()->SymbolDirection(i));
141  }
142  // Convert a single separtor sandwiched between two EN's into an EN.
143  for (int i = 0; i + 2 < word_length_; i++) {
144  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
145  (letter_types[i + 1] == U_EURO_NUM_SEP ||
146  letter_types[i + 1] == U_COMMON_NUM_SEP)) {
147  letter_types[i + 1] = U_EURO_NUM;
148  }
149  }
150  // Scan for sequences of European Number Terminators around ENs and convert
151  // them to ENs.
152  for (int i = 0; i < word_length_; i++) {
153  if (letter_types[i] == U_EURO_NUM_TERM) {
154  int j = i + 1;
155  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
156  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
157  // The sequence [i..j] should be converted to all European Numbers.
158  for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
159  }
160  j = i - 1;
161  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
162  if (j > -1 && letter_types[j] == U_EURO_NUM) {
163  // The sequence [j..i] should be converted to all European Numbers.
164  for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
165  }
166  }
167  }
168  // Step 2: Convert all remaining types to either L or R.
169  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
170  // All other are R.
171  for (int i = 0; i < word_length_;) {
172  int ti = letter_types[i];
173  if (ti == U_LTR || ti == U_EURO_NUM) {
174  // Left to right sequence; scan to the end of it.
175  int last_good = i;
176  for (int j = i + 1; j < word_length_; j++) {
177  int tj = letter_types[j];
178  if (tj == U_LTR || tj == U_EURO_NUM) {
179  last_good = j;
180  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
181  // do nothing.
182  } else {
183  break;
184  }
185  }
186  // [i..last_good] is the L sequence
187  for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
188  i = last_good + 1;
189  } else {
190  letter_types[i] = U_RTL;
191  i++;
192  }
193  }
194 
195  // At this point, letter_types is entirely U_LTR or U_RTL.
196  for (int i = word_length_ - 1; i >= 0;) {
197  if (letter_types[i] == U_RTL) {
198  blob_indices->push_back(i);
199  i--;
200  } else {
201  // left to right sequence. scan to the beginning.
202  int j = i - 1;
203  for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
204  // Now (j, i] is LTR
205  for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
206  i = j;
207  }
208  }
209  ASSERT_HOST(blob_indices->size() == word_length_);
210 }
211 
212 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
213  for (int i = 0; i < dirs.size(); i++) {
214  switch (dirs[i]) {
215  case DIR_NEUTRAL: tprintf ("N "); break;
216  case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
217  case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
218  case DIR_MIX: tprintf("Z "); break;
219  default: tprintf("? "); break;
220  }
221  }
222  tprintf("\n");
223 }
224 
226  bool paragraph_is_ltr,
227  const LTRResultIterator &resit,
228  GenericVectorEqEq<int> *word_indices) const {
230  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
231 }
232 
234  bool paragraph_is_ltr,
235  const LTRResultIterator &resit,
237  GenericVectorEqEq<int> *word_indices) const {
240  directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
241  directions->truncate(0);
242 
243  // A LTRResultIterator goes strictly left-to-right word order.
244  LTRResultIterator ltr_it(resit);
245  ltr_it.RestartRow();
246  if (ltr_it.Empty(RIL_WORD)) return;
247  do {
248  directions->push_back(ltr_it.WordDirection());
249  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
250 
251  word_indices->truncate(0);
252  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
253 }
254 
256  bool paragraph_is_ltr,
257  const GenericVector<StrongScriptDirection> &word_dirs,
258  GenericVectorEqEq<int> *reading_order) {
259  reading_order->truncate(0);
260  if (word_dirs.size() == 0) return;
261 
262  // Take all of the runs of minor direction words and insert them
263  // in reverse order.
264  int minor_direction, major_direction, major_step, start, end;
265  if (paragraph_is_ltr) {
266  start = 0;
267  end = word_dirs.size();
268  major_step = 1;
269  major_direction = DIR_LEFT_TO_RIGHT;
270  minor_direction = DIR_RIGHT_TO_LEFT;
271  } else {
272  start = word_dirs.size() - 1;
273  end = -1;
274  major_step = -1;
275  major_direction = DIR_RIGHT_TO_LEFT;
276  minor_direction = DIR_LEFT_TO_RIGHT;
277  // Special rule: if there are neutral words at the right most side
278  // of a line adjacent to a left-to-right word in the middle of the
279  // line, we interpret the end of the line as a single LTR sequence.
280  if (word_dirs[start] == DIR_NEUTRAL) {
281  int neutral_end = start;
282  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
283  neutral_end--;
284  }
285  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
286  // LTR followed by neutrals.
287  // Scan for the beginning of the minor left-to-right run.
288  int left = neutral_end;
289  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
290  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
291  }
292  reading_order->push_back(kMinorRunStart);
293  for (int i = left; i < word_dirs.size(); i++) {
294  reading_order->push_back(i);
295  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
296  }
297  reading_order->push_back(kMinorRunEnd);
298  start = left - 1;
299  }
300  }
301  }
302  for (int i = start; i != end;) {
303  if (word_dirs[i] == minor_direction) {
304  int j = i;
305  while (j != end && word_dirs[j] != major_direction)
306  j += major_step;
307  if (j == end) j -= major_step;
308  while (j != i && word_dirs[j] != minor_direction)
309  j -= major_step;
310  // [j..i] is a minor direction run.
311  reading_order->push_back(kMinorRunStart);
312  for (int k = j; k != i; k -= major_step) {
313  reading_order->push_back(k);
314  }
315  reading_order->push_back(i);
316  reading_order->push_back(kMinorRunEnd);
317  i = j + major_step;
318  } else {
319  reading_order->push_back(i);
320  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
321  i += major_step;
322  }
323  }
324 }
325 
326 int ResultIterator::LTRWordIndex() const {
327  int this_word_index = 0;
328  LTRResultIterator textline(*this);
329  textline.RestartRow();
330  while (!textline.PositionedAtSameWord(it_)) {
331  this_word_index++;
332  textline.Next(RIL_WORD);
333  }
334  return this_word_index;
335 }
336 
337 void ResultIterator::MoveToLogicalStartOfWord() {
338  if (word_length_ == 0) {
339  BeginWord(0);
340  return;
341  }
342  GenericVector<int> blob_order;
343  CalculateBlobOrder(&blob_order);
344  if (blob_order.size() == 0 || blob_order[0] == 0) return;
345  BeginWord(blob_order[0]);
346 }
347 
348 bool ResultIterator::IsAtFinalSymbolOfWord() const {
349  if (!it_->word()) return true;
350  GenericVector<int> blob_order;
351  CalculateBlobOrder(&blob_order);
352  return blob_order.size() == 0 || blob_order.back() == blob_index_;
353 }
354 
355 bool ResultIterator::IsAtFirstSymbolOfWord() const {
356  if (!it_->word()) return true;
357  GenericVector<int> blob_order;
358  CalculateBlobOrder(&blob_order);
359  return blob_order.size() == 0 || blob_order[0] == blob_index_;
360 }
361 
362 void ResultIterator::AppendSuffixMarks(STRING *text) const {
363  if (!it_->word()) return;
364  bool reading_direction_is_ltr =
365  current_paragraph_is_ltr_ ^ in_minor_direction_;
366  // scan forward to see what meta-information the word ordering algorithm
367  // left us.
368  // If this word is at the *end* of a minor run, insert the other
369  // direction's mark; else if this was a complex word, insert the
370  // current reading order's mark.
371  GenericVectorEqEq<int> textline_order;
372  CalculateTextlineOrder(current_paragraph_is_ltr_,
373  *this, &textline_order);
374  int this_word_index = LTRWordIndex();
375  int i = textline_order.get_index(this_word_index);
376  if (i < 0) return;
377 
378  int last_non_word_mark = 0;
379  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
380  last_non_word_mark = textline_order[i];
381  }
382  if (last_non_word_mark == kComplexWord) {
383  *text += reading_direction_is_ltr ? kLRM : kRLM;
384  } else if (last_non_word_mark == kMinorRunEnd) {
385  if (current_paragraph_is_ltr_) {
386  *text += kLRM;
387  } else {
388  *text += kRLM;
389  }
390  }
391 }
392 
393 void ResultIterator::MoveToLogicalStartOfTextline() {
394  GenericVectorEqEq<int> word_indices;
395  RestartRow();
396  CalculateTextlineOrder(current_paragraph_is_ltr_,
397  dynamic_cast<const LTRResultIterator&>(*this),
398  &word_indices);
399  int i = 0;
400  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
401  if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
402  else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
403  }
404  if (in_minor_direction_) at_beginning_of_minor_run_ = true;
405  if (i >= word_indices.size()) return;
406  int first_word_index = word_indices[i];
407  for (int j = 0; j < first_word_index; j++) {
409  }
410  MoveToLogicalStartOfWord();
411 }
412 
// NOTE(review): the opening lines of this definition (listing lines 413-414)
// are not present in this extract, so its name and any statement preceding
// the ones below cannot be confirmed from here.
// The visible tail recomputes the paragraph direction, clears both minor-run
// flags and then moves to the logical start of the current text line.
415  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
416  in_minor_direction_ = false;
417  at_beginning_of_minor_run_ = false;
418  MoveToLogicalStartOfTextline();
419 }
420 
422  if (it_->block() == NULL) return false; // already at end!
423  switch (level) {
424  case RIL_BLOCK: // explicit fall-through
425  case RIL_PARA: // explicit fall-through
426  case RIL_TEXTLINE:
427  if (!PageIterator::Next(level)) return false;
429  // if we've advanced to a new paragraph,
430  // recalculate current_paragraph_is_ltr_
431  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
432  }
433  in_minor_direction_ = false;
434  MoveToLogicalStartOfTextline();
435  return it_->block() != NULL;
436  case RIL_SYMBOL:
437  {
438  GenericVector<int> blob_order;
439  CalculateBlobOrder(&blob_order);
440  int next_blob = 0;
441  while (next_blob < blob_order.size() &&
442  blob_index_ != blob_order[next_blob])
443  next_blob++;
444  next_blob++;
445  if (next_blob < blob_order.size()) {
446  // we're in the same word; simply advance one blob.
447  BeginWord(blob_order[next_blob]);
448  at_beginning_of_minor_run_ = false;
449  return true;
450  }
451  level = RIL_WORD; // we've fallen through to the next word.
452  }
453  case RIL_WORD: // explicit fall-through.
454  {
455  if (it_->word() == NULL) return Next(RIL_BLOCK);
456  GenericVectorEqEq<int> word_indices;
457  int this_word_index = LTRWordIndex();
458  CalculateTextlineOrder(current_paragraph_is_ltr_,
459  *this,
460  &word_indices);
461  int final_real_index = word_indices.size() - 1;
462  while (final_real_index > 0 && word_indices[final_real_index] < 0)
463  final_real_index--;
464  for (int i = 0; i < final_real_index; i++) {
465  if (word_indices[i] == this_word_index) {
466  int j = i + 1;
467  for (; j < final_real_index && word_indices[j] < 0; j++) {
468  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
469  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
470  }
471  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
472  // awesome, we move to word_indices[j]
473  if (BidiDebug(3)) {
474  tprintf("Next(RIL_WORD): %d -> %d\n",
475  this_word_index, word_indices[j]);
476  }
478  for (int k = 0; k < word_indices[j]; k++) {
480  }
481  MoveToLogicalStartOfWord();
482  return true;
483  }
484  }
485  if (BidiDebug(3)) {
486  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
487  }
488  // we're going off the end of the text line.
489  return Next(RIL_TEXTLINE);
490  }
491  }
492  ASSERT_HOST(false); // shouldn't happen.
493  return false;
494 }
495 
497  if (it_->block() == NULL) return false; // Already at the end!
498  if (it_->word() == NULL) return true; // In an image block.
499  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
500 
501  bool at_word_start = IsAtFirstSymbolOfWord();
502  if (level == RIL_WORD) return at_word_start;
503 
504  ResultIterator line_start(*this);
505  // move to the first word in the line...
506  line_start.MoveToLogicalStartOfTextline();
507 
508  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
509  if (level == RIL_TEXTLINE) return at_textline_start;
510 
511  // now we move to the left-most word...
512  line_start.RestartRow();
513  bool at_block_start = at_textline_start &&
514  line_start.it_->block() != line_start.it_->prev_block();
515  if (level == RIL_BLOCK) return at_block_start;
516 
517  bool at_para_start = at_block_start ||
518  (at_textline_start &&
519  line_start.it_->row()->row->para() !=
520  line_start.it_->prev_row()->row->para());
521  if (level == RIL_PARA) return at_para_start;
522 
523  ASSERT_HOST(false); // shouldn't happen.
524  return false;
525 }
526 
533  PageIteratorLevel element) const {
534  if (Empty(element)) return true; // Already at the end!
535  // The result is true if we step forward by element and find we are
536  // at the the end of the page or at beginning of *all* levels in:
537  // [level, element).
538  // When there is more than one level difference between element and level,
539  // we could for instance move forward one symbol and still be at the first
540  // word on a line, so we also have to be at the first symbol in a word.
541  ResultIterator next(*this);
542  next.Next(element);
543  if (next.Empty(element)) return true; // Reached the end of the page.
544  while (element > level) {
545  element = static_cast<PageIteratorLevel>(element - 1);
546  if (!next.IsAtBeginningOf(element))
547  return false;
548  }
549  return true;
550 }
551 
557  if (it_->word() == NULL) return NULL; // Already at the end!
558  STRING text;
559  switch (level) {
560  case RIL_BLOCK:
561  {
562  ResultIterator pp(*this);
563  do {
564  pp.AppendUTF8ParagraphText(&text);
565  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
566  }
567  break;
568  case RIL_PARA:
569  AppendUTF8ParagraphText(&text);
570  break;
571  case RIL_TEXTLINE:
572  {
573  ResultIterator it(*this);
574  it.MoveToLogicalStartOfTextline();
575  it.IterateAndAppendUTF8TextlineText(&text);
576  }
577  break;
578  case RIL_WORD:
579  AppendUTF8WordText(&text);
580  break;
581  case RIL_SYMBOL:
582  {
583  bool reading_direction_is_ltr =
584  current_paragraph_is_ltr_ ^ in_minor_direction_;
585  if (at_beginning_of_minor_run_) {
586  text += reading_direction_is_ltr ? kLRM : kRLM;
587  }
588  text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
589  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
590  }
591  break;
592  }
593  int length = text.length() + 1;
594  char* result = new char[length];
595  strncpy(result, text.string(), length);
596  return result;
597 }
598 
599 void ResultIterator::AppendUTF8WordText(STRING *text) const {
600  if (!it_->word()) return;
602  bool reading_direction_is_ltr =
603  current_paragraph_is_ltr_ ^ in_minor_direction_;
604  if (at_beginning_of_minor_run_) {
605  *text += reading_direction_is_ltr ? kLRM : kRLM;
606  }
607 
608  GenericVector<int> blob_order;
609  CalculateBlobOrder(&blob_order);
610  for (int i = 0; i < blob_order.size(); i++) {
611  *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
612  }
613  AppendSuffixMarks(text);
614 }
615 
616 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
617  if (Empty(RIL_WORD)) {
618  Next(RIL_WORD);
619  return;
620  }
621  if (BidiDebug(1)) {
622  GenericVectorEqEq<int> textline_order;
624  CalculateTextlineOrder(current_paragraph_is_ltr_,
625  *this, &dirs, &textline_order);
626  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
627  current_paragraph_is_ltr_ ? "ltr" : "rtl");
628  PrintScriptDirs(dirs);
629  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
630  current_paragraph_is_ltr_ ? "ltr" : "rtl");
631  for (int i = 0; i < textline_order.size(); i++) {
632  tprintf("%d ", textline_order[i]);
633  }
634  tprintf("\n");
635  }
636 
637  int words_appended = 0;
638  do {
639  int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
640  : (words_appended > 0);
641  for (int i = 0; i < numSpaces; ++i) {
642  *text += " ";
643  }
644  AppendUTF8WordText(text);
645  words_appended++;
646  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
647  if (BidiDebug(1)) {
648  tprintf("%d words printed\n", words_appended);
649  }
650  *text += line_separator_;
651  // If we just finished a paragraph, add an extra newline.
652  if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
653  *text += paragraph_separator_;
654 }
655 
656 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
657  ResultIterator it(*this);
658  it.RestartParagraph();
659  it.MoveToLogicalStartOfTextline();
660  if (it.Empty(RIL_WORD)) return;
661  do {
662  it.IterateAndAppendUTF8TextlineText(text);
663  } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
664 }
665 
666 bool ResultIterator::BidiDebug(int min_level) const {
667  int debug_level = 1;
668  IntParam *p = ParamUtils::FindParam<IntParam>(
669  "bidi_debug", GlobalParams()->int_params,
671  if (p != NULL) debug_level = (inT32)(*p);
672  return debug_level >= min_level;
673 }
674 
675 } // namespace tesseract.
StrongScriptDirection
Definition: unichar.h:40
int size() const
Definition: genericvector.h:72
void truncate(int size)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
T & back() const
bool UnicharsInReadingOrder() const
Definition: pageres.h:406
bool Empty(PageIteratorLevel level) const
virtual bool Next(PageIteratorLevel level)
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:364
BLOCK_RES * prev_block() const
Definition: pageres.h:730
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
inT32 length() const
Definition: strngs.cpp:188
virtual char * GetUTF8Text(PageIteratorLevel level) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool IsWithinFirstTextlineOfParagraph() const
BLOCK_RES * block() const
Definition: pageres.h:739
TESS_LOCAL void BeginWord(int offset)
static const int kMinorRunEnd
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
static const int kComplexWord
GenericVector< IntParam * > int_params
Definition: params.h:44
ROW_RES * row() const
Definition: pageres.h:736
const char * kLRM
Definition: unicodes.cpp:27
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
const char * kRLM
Definition: unicodes.cpp:28
GenericVector< BoolParam * > bool_params
Definition: params.h:45
PARA * para() const
Definition: ocrrow.h:115
WERD * word
Definition: pageres.h:175
ParamsVectors * params()
Definition: ccutil.h:65
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
ROW * row
Definition: pageres.h:127
uinT8 space()
Definition: werd.h:104
static const int kMinorRunStart
Definition: strngs.h:44
virtual void RestartRow()
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
#define NULL
Definition: host.h:144
virtual bool Next(PageIteratorLevel level)
const char * string() const
Definition: strngs.cpp:193
ROW_RES * prev_row() const
Definition: pageres.h:727
int get_index(T object) const
const char *const BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:342
WERD_RES * word() const
Definition: pageres.h:733
int inT32
Definition: host.h:102