tesseract  5.0.0-alpha-619-ge9db
paragraphs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author: David Eger
5  *
6  * (C) Copyright 2011, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "paragraphs.h"
20 #include <cctype> // for isspace
21 #include <cmath> // for abs
22 #include <cstdio> // for snprintf
23 #include <cstdlib> // for abs
24 #include <cstring> // for strchr, strlen
25 #include <algorithm> // for max
26 #include <memory> // for unique_ptr
27 #include <tesseract/genericvector.h> // for GenericVector, GenericVectorEqEq
28 #include <tesseract/helpers.h> // for UpdateRange, ClipToRange
29 #include "host.h" // for NearlyEqual
30 #include "mutableiterator.h" // for MutableIterator
31 #include "ocrblock.h" // for BLOCK
32 #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
33 #include "ocrrow.h" // for ROW
34 #include <tesseract/pageiterator.h> // for PageIterator
35 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
36 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
37 #include "pdblock.h" // for PDBLK
38 #include "polyblk.h" // for POLY_BLOCK
39 #include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40 #include "ratngs.h" // for WERD_CHOICE
41 #include "rect.h" // for TBOX
42 #include "statistc.h" // for STATS
43 #include <tesseract/strngs.h> // for STRING
44 #include "tprintf.h" // for tprintf
45 #include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
46 #include "unicharset.h" // for UNICHARSET
47 #include "unicodes.h" // for kPDF, kRLE
48 #include "werd.h" // for WERD, W_REP_CHAR
49 
50 namespace tesseract {
51 
52 // Special "weak" ParagraphModels.
54  = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
56  = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
57 
58 // Do the text and geometry of two rows support a paragraph break between them?
59 static bool LikelyParagraphStart(const RowScratchRegisters &before,
60  const RowScratchRegisters &after,
62 
63 // Given the width of a typical space between words, what is the threshold
64 // by which by which we think left and right alignments for paragraphs
65 // can vary and still be aligned.
66 static int Epsilon(int space_pix) {
67  return space_pix * 4 / 5;
68 }
69 
70 static bool AcceptableRowArgs(
71  int debug_level, int min_num_rows, const char *function_name,
73  int row_start, int row_end) {
74  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
75  tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
76  row_start, row_end, rows->size());
77  return false;
78  }
79  if (row_end - row_start < min_num_rows) {
80  if (debug_level > 1) {
81  tprintf("# Too few rows[%d, %d) for %s.\n",
82  row_start, row_end, function_name);
83  }
84  return false;
85  }
86  return true;
87 }
88 
89 // =============================== Debug Code ================================
90 
91 // Convert an integer to a decimal string.
92 static STRING StrOf(int num) {
93  char buffer[30];
94  snprintf(buffer, sizeof(buffer), "%d", num);
95  return STRING(buffer);
96 }
97 
98 // Given a row-major matrix of unicode text and a column separator, print
99 // a formatted table. For ASCII, we get good column alignment.
100 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
101  const STRING &colsep) {
102  GenericVector<int> max_col_widths;
103  for (int r = 0; r < rows.size(); r++) {
104  int num_columns = rows[r].size();
105  for (int c = 0; c < num_columns; c++) {
106  int num_unicodes = 0;
107  for (int i = 0; i < rows[r][c].size(); i++) {
108  if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
109  }
110  if (c >= max_col_widths.size()) {
111  max_col_widths.push_back(num_unicodes);
112  } else {
113  if (num_unicodes > max_col_widths[c])
114  max_col_widths[c] = num_unicodes;
115  }
116  }
117  }
118 
119  GenericVector<STRING> col_width_patterns;
120  for (int c = 0; c < max_col_widths.size(); c++) {
121  col_width_patterns.push_back(
122  STRING("%-") + StrOf(max_col_widths[c]) + "s");
123  }
124 
125  for (int r = 0; r < rows.size(); r++) {
126  for (int c = 0; c < rows[r].size(); c++) {
127  if (c > 0)
128  tprintf("%s", colsep.c_str());
129  tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
130  }
131  tprintf("\n");
132  }
133 }
134 
135 static STRING RtlEmbed(const STRING &word, bool rtlify) {
136  if (rtlify)
137  return STRING(kRLE) + word + STRING(kPDF);
138  return word;
139 }
140 
141 // Print the current thoughts of the paragraph detector.
142 static void PrintDetectorState(const ParagraphTheory &theory,
146  output.back().push_back("#row");
147  output.back().push_back("space");
148  output.back().push_back("..");
149  output.back().push_back("lword[widthSEL]");
150  output.back().push_back("rword[widthSEL]");
152  output.back().push_back("text");
153 
154  for (int i = 0; i < rows.size(); i++) {
156  GenericVector<STRING> &row = output.back();
157  const RowInfo& ri = *rows[i].ri_;
158  row.push_back(StrOf(i));
159  row.push_back(StrOf(ri.average_interword_space));
160  row.push_back(ri.has_leaders ? ".." : " ");
161  row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
162  "[" + StrOf(ri.lword_box.width()) +
163  (ri.lword_likely_starts_idea ? "S" : "s") +
164  (ri.lword_likely_ends_idea ? "E" : "e") +
165  (ri.lword_indicates_list_item ? "L" : "l") +
166  "]");
167  row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
168  "[" + StrOf(ri.rword_box.width()) +
169  (ri.rword_likely_starts_idea ? "S" : "s") +
170  (ri.rword_likely_ends_idea ? "E" : "e") +
171  (ri.rword_indicates_list_item ? "L" : "l") +
172  "]");
173  rows[i].AppendDebugInfo(theory, &row);
174  row.push_back(RtlEmbed(ri.text, !ri.ltr));
175  }
176  PrintTable(output, " ");
177 
178  tprintf("Active Paragraph Models:\n");
179  for (int m = 0; m < theory.models().size(); m++) {
180  tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().c_str());
181  }
182 }
183 
184 static void DebugDump(
185  bool should_print,
186  const STRING &phase,
187  const ParagraphTheory &theory,
189  if (!should_print)
190  return;
191  tprintf("# %s\n", phase.c_str());
192  PrintDetectorState(theory, rows);
193 }
194 
195 // Print out the text for rows[row_start, row_end)
196 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
197  int row_start, int row_end) {
198  tprintf("======================================\n");
199  for (int row = row_start; row < row_end; row++) {
200  tprintf("%s\n", rows[row].ri_->text.c_str());
201  }
202  tprintf("======================================\n");
203 }
204 
205 // ============= Brain Dead Language Model (ASCII Version) ===================
206 
207 static bool IsLatinLetter(int ch) {
208  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
209 }
210 
211 static bool IsDigitLike(int ch) {
212  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
213 }
214 
215 static bool IsOpeningPunct(int ch) {
216  return strchr("'\"({[", ch) != nullptr;
217 }
218 
219 static bool IsTerminalPunct(int ch) {
220  return strchr(":'\".?!]})", ch) != nullptr;
221 }
222 
223 // Return a pointer after consuming as much text as qualifies as roman numeral.
224 static const char *SkipChars(const char *str, const char *toskip) {
225  while (*str != '\0' && strchr(toskip, *str)) { str++; }
226  return str;
227 }
228 
229 static const char *SkipChars(const char *str, bool (*skip)(int)) {
230  while (*str != '\0' && skip(*str)) { str++; }
231  return str;
232 }
233 
234 static const char *SkipOne(const char *str, const char *toskip) {
235  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
236  return str;
237 }
238 
239 // Return whether it is very likely that this is a numeral marker that could
240 // start a list item. Some examples include:
241 // A I iii. VI (2) 3.5. [C-4]
242 static bool LikelyListNumeral(const STRING &word) {
243  const char *kRomans = "ivxlmdIVXLMD";
244  const char *kDigits = "012345789";
245  const char *kOpen = "[{(";
246  const char *kSep = ":;-.,";
247  const char *kClose = "]})";
248 
249  int num_segments = 0;
250  const char *pos = word.c_str();
251  while (*pos != '\0' && num_segments < 3) {
252  // skip up to two open parens.
253  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
254  const char *numeral_end = SkipChars(numeral_start, kRomans);
255  if (numeral_end != numeral_start) {
256  // Got Roman Numeral. Great.
257  } else {
258  numeral_end = SkipChars(numeral_start, kDigits);
259  if (numeral_end == numeral_start) {
260  // If there's a single latin letter, we can use that.
261  numeral_end = SkipChars(numeral_start, IsLatinLetter);
262  if (numeral_end - numeral_start != 1)
263  break;
264  }
265  }
266  // We got some sort of numeral.
267  num_segments++;
268  // Skip any trailing parens or punctuation.
269  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270  if (pos == numeral_end)
271  break;
272  }
273  return *pos == '\0';
274 }
275 
276 static bool LikelyListMark(const STRING &word) {
277  const char *kListMarks = "0Oo*.,+.";
278  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
279 }
280 
281 bool AsciiLikelyListItem(const STRING &word) {
282  return LikelyListMark(word) || LikelyListNumeral(word);
283 }
284 
285 // ========== Brain Dead Language Model (Tesseract Version) ================
286 
287 // Return the first Unicode Codepoint from werd[pos].
288 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
289  if (!u || !werd || pos > werd->length())
290  return 0;
291  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
292 }
293 
294 // A useful helper class for finding the first j >= i so that word[j]
295 // does not have given character type.
297  public:
298  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
299  : u_(unicharset), word_(word) { wordlen_ = word->length(); }
300 
301  // Given an input position, return the first position >= pos not punc.
302  int SkipPunc(int pos);
303  // Given an input position, return the first position >= pos not digit.
304  int SkipDigits(int pos);
305  // Given an input position, return the first position >= pos not roman.
306  int SkipRomans(int pos);
307  // Given an input position, return the first position >= pos not alpha.
308  int SkipAlpha(int pos);
309 
310  private:
311  const UNICHARSET *u_;
312  const WERD_CHOICE *word_;
313  int wordlen_;
314 };
315 
316 int UnicodeSpanSkipper::SkipPunc(int pos) {
317  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
318  return pos;
319 }
320 
321 int UnicodeSpanSkipper::SkipDigits(int pos) {
322  while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
323  IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
324  return pos;
325 }
326 
327 int UnicodeSpanSkipper::SkipRomans(int pos) {
328  const char *kRomans = "ivxlmdIVXLMD";
329  while (pos < wordlen_) {
330  int ch = UnicodeFor(u_, word_, pos);
331  if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break;
332  pos++;
333  }
334  return pos;
335 }
336 
337 int UnicodeSpanSkipper::SkipAlpha(int pos) {
338  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
339  return pos;
340 }
341 
342 static bool LikelyListMarkUnicode(int ch) {
343  if (ch < 0x80) {
344  STRING single_ch;
345  single_ch += ch;
346  return LikelyListMark(single_ch);
347  }
348  switch (ch) {
349  // TODO(eger) expand this list of unicodes as needed.
350  case 0x00B0: // degree sign
351  case 0x2022: // bullet
352  case 0x25E6: // white bullet
353  case 0x00B7: // middle dot
354  case 0x25A1: // white square
355  case 0x25A0: // black square
356  case 0x25AA: // black small square
357  case 0x2B1D: // black very small square
358  case 0x25BA: // black right-pointing pointer
359  case 0x25CF: // black circle
360  case 0x25CB: // white circle
361  return true;
362  default:
363  break; // fall through
364  }
365  return false;
366 }
367 
368 // Return whether it is very likely that this is a numeral marker that could
369 // start a list item. Some examples include:
370 // A I iii. VI (2) 3.5. [C-4]
371 static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
372  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
373  return true;
374 
375  UnicodeSpanSkipper m(u, werd);
376  int num_segments = 0;
377  int pos = 0;
378  while (pos < werd->length() && num_segments < 3) {
379  int numeral_start = m.SkipPunc(pos);
380  if (numeral_start > pos + 1) break;
381  int numeral_end = m.SkipRomans(numeral_start);
382  if (numeral_end == numeral_start) {
383  numeral_end = m.SkipDigits(numeral_start);
384  if (numeral_end == numeral_start) {
385  // If there's a single latin letter, we can use that.
386  numeral_end = m.SkipAlpha(numeral_start);
387  if (numeral_end - numeral_start != 1)
388  break;
389  }
390  }
391  // We got some sort of numeral.
392  num_segments++;
393  // Skip any trailing punctuation.
394  pos = m.SkipPunc(numeral_end);
395  if (pos == numeral_end)
396  break;
397  }
398  return pos == werd->length();
399 }
400 
401 // ========= Brain Dead Language Model (combined entry points) ================
402 
403 // Given the leftmost word of a line either as a Tesseract unicharset + werd
404 // or a utf8 string, set the following attributes for it:
405 // is_list - this word might be a list number or bullet.
406 // starts_idea - this word is likely to start a sentence.
407 // ends_idea - this word is likely to end a sentence.
408 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
409  const STRING &utf8,
410  bool *is_list, bool *starts_idea, bool *ends_idea) {
411  *is_list = false;
412  *starts_idea = false;
413  *ends_idea = false;
414  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
415  *ends_idea = true;
416  return;
417  }
418 
419  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
420  if (UniLikelyListItem(unicharset, werd)) {
421  *is_list = true;
422  *starts_idea = true;
423  *ends_idea = true;
424  }
425  if (unicharset->get_isupper(werd->unichar_id(0))) {
426  *starts_idea = true;
427  }
428  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
429  *starts_idea = true;
430  *ends_idea = true;
431  }
432  } else { // Assume utf8 is mostly ASCII
433  if (AsciiLikelyListItem(utf8)) {
434  *is_list = true;
435  *starts_idea = true;
436  }
437  int start_letter = utf8[0];
438  if (IsOpeningPunct(start_letter)) {
439  *starts_idea = true;
440  }
441  if (IsTerminalPunct(start_letter)) {
442  *ends_idea = true;
443  }
444  if (start_letter >= 'A' && start_letter <= 'Z') {
445  *starts_idea = true;
446  }
447  }
448 }
449 
450 // Given the rightmost word of a line either as a Tesseract unicharset + werd
451 // or a utf8 string, set the following attributes for it:
452 // is_list - this word might be a list number or bullet.
453 // starts_idea - this word is likely to start a sentence.
454 // ends_idea - this word is likely to end a sentence.
455 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
456  const STRING &utf8,
457  bool *is_list, bool *starts_idea, bool *ends_idea) {
458  *is_list = false;
459  *starts_idea = false;
460  *ends_idea = false;
461  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
462  *ends_idea = true;
463  return;
464  }
465 
466  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
467  if (UniLikelyListItem(unicharset, werd)) {
468  *is_list = true;
469  *starts_idea = true;
470  }
471  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
472  if (unicharset->get_ispunctuation(last_letter)) {
473  *ends_idea = true;
474  }
475  } else { // Assume utf8 is mostly ASCII
476  if (AsciiLikelyListItem(utf8)) {
477  *is_list = true;
478  *starts_idea = true;
479  }
480  int last_letter = utf8[utf8.size() - 1];
481  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
482  *ends_idea = true;
483  }
484  }
485 }
486 
487 // =============== Implementation of RowScratchRegisters =====================
488 /* static */
490  GenericVector<STRING> *header) {
491  header->push_back("[lmarg,lind;rind,rmarg]");
492  header->push_back("model");
493 }
494 
495 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
496  GenericVector<STRING> *dbg) const {
497  char s[30];
498  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
500  dbg->push_back(s);
501  STRING model_string;
502  model_string += static_cast<char>(GetLineType());
503  model_string += ":";
504 
505  int model_numbers = 0;
506  for (int h = 0; h < hypotheses_.size(); h++) {
507  if (hypotheses_[h].model == nullptr)
508  continue;
509  if (model_numbers > 0)
510  model_string += ",";
511  if (StrongModel(hypotheses_[h].model)) {
512  model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
513  } else if (hypotheses_[h].model == kCrownLeft) {
514  model_string += "CrL";
515  } else if (hypotheses_[h].model == kCrownRight) {
516  model_string += "CrR";
517  }
518  model_numbers++;
519  }
520  if (model_numbers == 0)
521  model_string += "0";
522 
523  dbg->push_back(model_string);
524 }
525 
526 void RowScratchRegisters::Init(const RowInfo &row) {
527  ri_ = &row;
528  lmargin_ = 0;
529  lindent_ = row.pix_ldistance;
530  rmargin_ = 0;
531  rindent_ = row.pix_rdistance;
532 }
533 
535  if (hypotheses_.empty())
536  return LT_UNKNOWN;
537  bool has_start = false;
538  bool has_body = false;
539  for (int i = 0; i < hypotheses_.size(); i++) {
540  switch (hypotheses_[i].ty) {
541  case LT_START: has_start = true; break;
542  case LT_BODY: has_body = true; break;
543  default:
544  tprintf("Encountered bad value in hypothesis list: %c\n",
545  hypotheses_[i].ty);
546  break;
547  }
548  }
549  if (has_start && has_body)
550  return LT_MULTIPLE;
551  return has_start ? LT_START : LT_BODY;
552 }
553 
555  if (hypotheses_.empty())
556  return LT_UNKNOWN;
557  bool has_start = false;
558  bool has_body = false;
559  for (int i = 0; i < hypotheses_.size(); i++) {
560  if (hypotheses_[i].model != model)
561  continue;
562  switch (hypotheses_[i].ty) {
563  case LT_START: has_start = true; break;
564  case LT_BODY: has_body = true; break;
565  default:
566  tprintf("Encountered bad value in hypothesis list: %c\n",
567  hypotheses_[i].ty);
568  break;
569  }
570  }
571  if (has_start && has_body)
572  return LT_MULTIPLE;
573  return has_start ? LT_START : LT_BODY;
574 }
575 
577  LineType current_lt = GetLineType();
578  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
579  tprintf("Trying to set a line to be START when it's already BODY.\n");
580  }
581  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
582  hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
583  }
584 }
585 
587  LineType current_lt = GetLineType();
588  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
589  tprintf("Trying to set a line to be BODY when it's already START.\n");
590  }
591  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
592  hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
593  }
594 }
595 
597  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
598  int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
599  if (old_idx >= 0)
600  hypotheses_.remove(old_idx);
601 }
602 
604  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
605  int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
606  if (old_idx >= 0)
607  hypotheses_.remove(old_idx);
608 }
609 
611  for (int h = 0; h < hypotheses_.size(); h++) {
612  if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
613  models->push_back_new(hypotheses_[h].model);
614  }
615 }
616 
618  for (int h = 0; h < hypotheses_.size(); h++) {
619  if (StrongModel(hypotheses_[h].model))
620  models->push_back_new(hypotheses_[h].model);
621  }
622 }
623 
625  for (int h = 0; h < hypotheses_.size(); h++) {
626  if (hypotheses_[h].model != nullptr)
627  models->push_back_new(hypotheses_[h].model);
628  }
629 }
630 
632  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
633  return nullptr;
634  return hypotheses_[0].model;
635 }
636 
638  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
639  return nullptr;
640  return hypotheses_[0].model;
641 }
642 
643 // Discard any hypotheses whose model is not in the given list.
645  const SetOfModels &models) {
646  if (models.empty())
647  return;
648  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
649  if (!models.contains(hypotheses_[h].model)) {
650  hypotheses_.remove(h);
651  }
652  }
653 }
654 
655 // ============ Geometry based Paragraph Detection Algorithm =================
656 
657 struct Cluster {
658  Cluster() : center(0), count(0) {}
659  Cluster(int cen, int num) : center(cen), count(num) {}
660 
661  int center; // The center of the cluster.
662  int count; // The number of entries within the cluster.
663 };
664 
665 class SimpleClusterer {
666  public:
667  explicit SimpleClusterer(int max_cluster_width)
668  : max_cluster_width_(max_cluster_width) {}
669  void Add(int value) { values_.push_back(value); }
670  int size() const { return values_.size(); }
671  void GetClusters(GenericVector<Cluster> *clusters);
672 
673  private:
674  int max_cluster_width_;
675  GenericVectorEqEq<int> values_;
676 };
677 
678 // Return the index of the cluster closest to value.
679 static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
680  int best_index = 0;
681  for (int i = 0; i < clusters.size(); i++) {
682  if (abs(value - clusters[i].center) <
683  abs(value - clusters[best_index].center))
684  best_index = i;
685  }
686  return best_index;
687 }
688 
690  clusters->clear();
691  values_.sort();
692  for (int i = 0; i < values_.size();) {
693  int orig_i = i;
694  int lo = values_[i];
695  int hi = lo;
696  while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
697  hi = values_[i];
698  }
699  clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
700  }
701 }
702 
703 // Calculate left- and right-indent tab stop values seen in
704 // rows[row_start, row_end) given a tolerance of tolerance.
705 static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
706  int row_start, int row_end, int tolerance,
707  GenericVector<Cluster> *left_tabs,
708  GenericVector<Cluster> *right_tabs) {
709  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
710  return;
711  // First pass: toss all left and right indents into clusterers.
712  SimpleClusterer initial_lefts(tolerance);
713  SimpleClusterer initial_rights(tolerance);
714  GenericVector<Cluster> initial_left_tabs;
715  GenericVector<Cluster> initial_right_tabs;
716  for (int i = row_start; i < row_end; i++) {
717  initial_lefts.Add((*rows)[i].lindent_);
718  initial_rights.Add((*rows)[i].rindent_);
719  }
720  initial_lefts.GetClusters(&initial_left_tabs);
721  initial_rights.GetClusters(&initial_right_tabs);
722 
723  // Second pass: cluster only lines that are not "stray"
724  // An example of a stray line is a page number -- a line whose start
725  // and end tab-stops are far outside the typical start and end tab-stops
726  // for the block.
727  // Put another way, we only cluster data from lines whose start or end
728  // tab stop is frequent.
729  SimpleClusterer lefts(tolerance);
730  SimpleClusterer rights(tolerance);
731 
732  // Outlier elimination. We might want to switch this to test outlier-ness
733  // based on how strange a position an outlier is in instead of or in addition
734  // to how rare it is. These outliers get re-added if we end up having too
735  // few tab stops, to work with, however.
736  int infrequent_enough_to_ignore = 0;
737  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
738  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
739 
740  for (int i = row_start; i < row_end; i++) {
741  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
742  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
743  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
744  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
745  lefts.Add((*rows)[i].lindent_);
746  rights.Add((*rows)[i].rindent_);
747  }
748  }
749  lefts.GetClusters(left_tabs);
750  rights.GetClusters(right_tabs);
751 
752  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
753  (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
754  // One side is really ragged, and the other only has one tab stop,
755  // so those "insignificant outliers" are probably important, actually.
756  // This often happens on a page of an index. Add back in the ones
757  // we omitted in the first pass.
758  for (int i = row_start; i < row_end; i++) {
759  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
760  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
761  if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
762  initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
763  lefts.Add((*rows)[i].lindent_);
764  rights.Add((*rows)[i].rindent_);
765  }
766  }
767  }
768  lefts.GetClusters(left_tabs);
769  rights.GetClusters(right_tabs);
770 
771  // If one side is almost a two-indent aligned side, and the other clearly
772  // isn't, try to prune out the least frequent tab stop from that side.
773  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
774  int to_prune = -1;
775  for (int i = left_tabs->size() - 1; i >= 0; i--) {
776  if (to_prune < 0 ||
777  (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
778  to_prune = i;
779  }
780  }
781  if (to_prune >= 0 &&
782  (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783  left_tabs->remove(to_prune);
784  }
785  }
786  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
787  int to_prune = -1;
788  for (int i = right_tabs->size() - 1; i >= 0; i--) {
789  if (to_prune < 0 ||
790  (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
791  to_prune = i;
792  }
793  }
794  if (to_prune >= 0 &&
795  (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
796  right_tabs->remove(to_prune);
797  }
798  }
799 }
800 
801 // Given a paragraph model mark rows[row_start, row_end) as said model
802 // start or body lines.
803 //
804 // Case 1: model->first_indent_ != model->body_indent_
805 // Differentiating the paragraph start lines from the paragraph body lines in
806 // this case is easy, we just see how far each line is indented.
807 //
808 // Case 2: model->first_indent_ == model->body_indent_
809 // Here, we find end-of-paragraph lines by looking for "short lines."
810 // What constitutes a "short line" changes depending on whether the text
811 // ragged-right[left] or fully justified (aligned left and right).
812 //
813 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
814 // We have a new paragraph it the first word would have at the end
815 // of the previous line.
816 //
817 // Case 2b: Fully Justified. (eop_threshold > 0)
818 // We mark a line as short (end of paragraph) if the offside indent
819 // is greater than eop_threshold.
820 static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
821  int row_start, int row_end,
822  const ParagraphModel *model,
823  bool ltr, int eop_threshold) {
824  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
825  return;
826  for (int row = row_start; row < row_end; row++) {
827  bool valid_first = ValidFirstLine(rows, row, model);
828  bool valid_body = ValidBodyLine(rows, row, model);
829  if (valid_first && !valid_body) {
830  (*rows)[row].AddStartLine(model);
831  } else if (valid_body && !valid_first) {
832  (*rows)[row].AddBodyLine(model);
833  } else if (valid_body && valid_first) {
834  bool after_eop = (row == row_start);
835  if (row > row_start) {
836  if (eop_threshold > 0) {
837  if (model->justification() == JUSTIFICATION_LEFT) {
838  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
839  } else {
840  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
841  }
842  } else {
843  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
844  model->justification());
845  }
846  }
847  if (after_eop) {
848  (*rows)[row].AddStartLine(model);
849  } else {
850  (*rows)[row].AddBodyLine(model);
851  }
852  } else {
853  // Do nothing. Stray row.
854  }
855  }
856 }
857 
858 // GeometricClassifierState holds all of the information we'll use while
859 // trying to determine a paragraph model for the text lines in a block of
860 // text:
861 // + the rows under consideration [row_start, row_end)
862 // + the common left- and right-indent tab stops
863 // + does the block start out left-to-right or right-to-left
864 // Further, this struct holds the data we amass for the (single) ParagraphModel
865 // we'll assign to the text lines (assuming we get that far).
866 struct GeometricClassifierState {
867  GeometricClassifierState(int dbg_level,
869  int r_start, int r_end)
870  : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
871  tolerance = InterwordSpace(*r, r_start, r_end);
872  CalculateTabStops(r, r_start, r_end, tolerance,
873  &left_tabs, &right_tabs);
874  if (debug_level >= 3) {
875  tprintf("Geometry: TabStop cluster tolerance = %d; "
876  "%d left tabs; %d right tabs\n",
877  tolerance, left_tabs.size(), right_tabs.size());
878  }
879  ltr = (*r)[r_start].ri_->ltr;
880  }
881 
884  margin = (*rows)[row_start].lmargin_;
885  }
886 
887  void AssumeRightJustification() {
889  margin = (*rows)[row_start].rmargin_;
890  }
891 
892  // Align tabs are the tab stops the text is aligned to.
893  const GenericVector<Cluster> &AlignTabs() const {
895  return left_tabs;
896  }
897 
898  // Offside tabs are the tab stops opposite the tabs used to align the text.
899  //
900  // Note that for a left-to-right text which is aligned to the right such as
901  // this function comment, the offside tabs are the horizontal tab stops
902  // marking the beginning of ("Note", "this" and "marking").
903  const GenericVector<Cluster> &OffsideTabs() const {
905  return right_tabs;
906  }
907 
908  // Return whether the i'th row extends from the leftmost left tab stop
909  // to the right most right tab stop.
910  bool IsFullRow(int i) const {
911  return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
912  ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
913  }
914 
915  int AlignsideTabIndex(int row_idx) const {
916  return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
917  }
918 
919  // Given what we know about the paragraph justification (just), would the
920  // first word of row_b have fit at the end of row_a?
921  bool FirstWordWouldHaveFit(int row_a, int row_b) {
923  (*rows)[row_a], (*rows)[row_b], just);
924  }
925 
926  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
927 
928  void Fail(int min_debug_level, const char *why) const {
929  if (debug_level < min_debug_level) return;
930  tprintf("# %s\n", why);
931  PrintRows();
932  }
933 
934  ParagraphModel Model() const {
936  }
937 
938  // We print out messages with a debug level at least as great as debug_level.
939  int debug_level = 0;
940 
941  // The Geometric Classifier was asked to find a single paragraph model
942  // to fit the text rows (*rows)[row_start, row_end)
944  int row_start = 0;
945  int row_end = 0;
946 
947  // The amount by which we expect the text edge can vary and still be aligned.
948  int tolerance = 0;
949 
950  // Is the script in this text block left-to-right?
951  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
952  bool ltr = false;
953 
954  // These left and right tab stops were determined to be the common tab
955  // stops for the given text.
958 
959  // These are parameters we must determine to create a ParagraphModel.
961  int margin = 0;
962  int first_indent = 0;
963  int body_indent = 0;
964 
965  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
966  int eop_threshold = 0;
967 };
968 
969 // Given a section of text where strong textual clues did not help identifying
970 // paragraph breaks, and for which the left and right indents have exactly
971 // three tab stops between them, attempt to find the paragraph breaks based
972 // solely on the outline of the text and whether the script is left-to-right.
973 //
974 // Algorithm Detail:
975 // The selected rows are in the form of a rectangle except
976 // for some number of "short lines" of the same length:
977 //
978 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
979 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
980 // xxxxxxxxxxxxx xxxxxxxxxxxx
981 // xxxxxxxxxxxxx xxxxxxxxxxxx
982 //
983 // We have a slightly different situation if the only short
984 // line is at the end of the excerpt.
985 //
986 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
987 // xxxxxxxxxxxxx xxxxxxxxxxxx
988 // xxxxxxxxxxxxx xxxxxxxxxxxx
989 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
990 //
991 // We'll interpret these as follows based on the reasoning in the comment for
992 // GeometricClassify():
993 // [script direction: first indent, body indent]
994 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
995 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
996 static void GeometricClassifyThreeTabStopTextBlock(
997  int debug_level,
999  ParagraphTheory *theory) {
1000  int num_rows = s.row_end - s.row_start;
1001  int num_full_rows = 0;
1002  int last_row_full = 0;
1003  for (int i = s.row_start; i < s.row_end; i++) {
1004  if (s.IsFullRow(i)) {
1005  num_full_rows++;
1006  if (i == s.row_end - 1) last_row_full++;
1007  }
1008  }
1009 
1010  if (num_full_rows < 0.7 * num_rows) {
1011  s.Fail(1, "Not enough full lines to know which lines start paras.");
1012  return;
1013  }
1014 
1015  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1016  s.eop_threshold = 0;
1017 
1018  if (s.ltr) {
1019  s.AssumeLeftJustification();
1020  } else {
1021  s.AssumeRightJustification();
1022  }
1023 
1024  if (debug_level > 0) {
1025  tprintf("# Not enough variety for clear outline classification. "
1026  "Guessing these are %s aligned based on script.\n",
1027  s.ltr ? "left" : "right");
1028  s.PrintRows();
1029  }
1030 
1031  if (s.AlignTabs().size() == 2) { // case A1 or A2
1032  s.first_indent = s.AlignTabs()[1].center;
1033  s.body_indent = s.AlignTabs()[0].center;
1034  } else { // case B1 or B2
1035  if (num_rows - 1 == num_full_rows - last_row_full) {
1036  // case B2
1037  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1038  (*s.rows)[s.row_start].AddStartLine(model);
1039  for (int i = s.row_start + 1; i < s.row_end; i++) {
1040  (*s.rows)[i].AddBodyLine(model);
1041  }
1042  return;
1043  } else {
1044  // case B1
1045  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1046  s.eop_threshold = (s.OffsideTabs()[0].center +
1047  s.OffsideTabs()[1].center) / 2;
1048  }
1049  }
1050  const ParagraphModel *model = theory->AddModel(s.Model());
1051  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1052  s.ltr, s.eop_threshold);
1053  return;
1054 }
1055 
1056 // This function is called if strong textual clues were not available, but
1057 // the caller hopes that the paragraph breaks will be super obvious just
1058 // by the outline of the text.
1059 //
1060 // The particularly difficult case is figuring out what's going on if you
1061 // don't have enough short paragraph end lines to tell us what's going on.
1062 //
1063 // For instance, let's say you have the following outline:
1064 //
1065 // (A1) xxxxxxxxxxxxxxxxxxxxxx
1066 // xxxxxxxxxxxxxxxxxxxx
1067 // xxxxxxxxxxxxxxxxxxxxxx
1068 // xxxxxxxxxxxxxxxxxxxxxx
1069 //
1070 // Even if we know that the text is left-to-right and so will probably be
1071 // left-aligned, both of the following are possible texts:
1072 //
1073 // (A1a) 1. Here our list item
1074 // with two full lines.
1075 // 2. Here a second item.
1076 // 3. Here our third one.
1077 //
1078 // (A1b) so ends paragraph one.
1079 // Here starts another
1080 // paragraph we want to
1081 // read. This continues
1082 //
1083 // These examples are obvious from the text and should have been caught
1084 // by the StrongEvidenceClassify pass. However, for languages where we don't
1085 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1086 // it's worth guessing that (A1b) is the correct interpretation if there are
1087 // far more "full" lines than "short" lines.
1088 static void GeometricClassify(int debug_level,
1090  int row_start, int row_end,
1091  ParagraphTheory *theory) {
1092  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1093  return;
1094  if (debug_level > 1) {
1095  tprintf("###############################################\n");
1096  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1097  row_start, row_end);
1098  tprintf("###############################################\n");
1099  }
1100  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1101 
1102  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1103  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1104  s.Fail(2, "Too much variety for simple outline classification.");
1105  return;
1106  }
1107  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1108  s.Fail(1, "Not enough variety for simple outline classification.");
1109  return;
1110  }
1111  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1112  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1113  return;
1114  }
1115 
1116  // At this point, we know that one side has at least two tab stops, and the
1117  // other side has one or two tab stops.
1118  // Left to determine:
1119  // (1) Which is the body indent and which is the first line indent?
1120  // (2) Is the text fully justified?
1121 
1122  // If one side happens to have three or more tab stops, assume that side
1123  // is opposite of the aligned side.
1124  if (s.right_tabs.size() > 2) {
1125  s.AssumeLeftJustification();
1126  } else if (s.left_tabs.size() > 2) {
1127  s.AssumeRightJustification();
1128  } else if (s.ltr) { // guess based on script direction
1129  s.AssumeLeftJustification();
1130  } else {
1131  s.AssumeRightJustification();
1132  }
1133 
1134  if (s.AlignTabs().size() == 2) {
1135  // For each tab stop on the aligned side, how many of them appear
1136  // to be paragraph start lines? [first lines]
1137  int firsts[2] = {0, 0};
1138  // Count the first line as a likely paragraph start line.
1139  firsts[s.AlignsideTabIndex(s.row_start)]++;
1140  // For each line, if the first word would have fit on the previous
1141  // line count it as a likely paragraph start line.
1142  bool jam_packed = true;
1143  for (int i = s.row_start + 1; i < s.row_end; i++) {
1144  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1145  firsts[s.AlignsideTabIndex(i)]++;
1146  jam_packed = false;
1147  }
1148  }
1149  // Make an extra accounting for the last line of the paragraph just
1150  // in case it's the only short line in the block. That is, take its
1151  // first word as typical and see if this looks like the *last* line
1152  // of a paragraph. If so, mark the *other* indent as probably a first.
1153  if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1154  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1155  }
1156 
1157  int percent0firsts, percent1firsts;
1158  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1159  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1160 
1161  // TODO(eger): Tune these constants if necessary.
1162  if ((percent0firsts < 20 && 30 < percent1firsts) ||
1163  percent0firsts + 30 < percent1firsts) {
1164  s.first_indent = s.AlignTabs()[1].center;
1165  s.body_indent = s.AlignTabs()[0].center;
1166  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1167  percent1firsts + 30 < percent0firsts) {
1168  s.first_indent = s.AlignTabs()[0].center;
1169  s.body_indent = s.AlignTabs()[1].center;
1170  } else {
1171  // Ambiguous! Probably lineated (poetry)
1172  if (debug_level > 1) {
1173  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1174  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1175  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1176  s.AlignTabs()[0].center, percent0firsts);
1177  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1178  s.AlignTabs()[1].center, percent1firsts);
1179  s.PrintRows();
1180  }
1181  return;
1182  }
1183  } else {
1184  // There's only one tab stop for the "aligned to" side.
1185  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1186  }
1187 
1188  // At this point, we have our model.
1189  const ParagraphModel *model = theory->AddModel(s.Model());
1190 
1191  // Now all we have to do is figure out if the text is fully justified or not.
1192  // eop_threshold: default to fully justified unless we see evidence below.
1193  // See description on MarkRowsWithModel()
1194  s.eop_threshold =
1195  (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1196  // If the text is not fully justified, re-set the eop_threshold to 0.
1197  if (s.AlignTabs().size() == 2) {
1198  // Paragraphs with a paragraph-start indent.
1199  for (int i = s.row_start; i < s.row_end - 1; i++) {
1200  if (ValidFirstLine(s.rows, i + 1, model) &&
1201  !NearlyEqual(s.OffsideTabs()[0].center,
1202  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1203  // We found a non-end-of-paragraph short line: not fully justified.
1204  s.eop_threshold = 0;
1205  break;
1206  }
1207  }
1208  } else {
1209  // Paragraphs with no paragraph-start indent.
1210  for (int i = s.row_start; i < s.row_end - 1; i++) {
1211  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1212  !NearlyEqual(s.OffsideTabs()[0].center,
1213  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1214  // We found a non-end-of-paragraph short line: not fully justified.
1215  s.eop_threshold = 0;
1216  break;
1217  }
1218  }
1219  }
1220  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1221 }
1222 
1223 // =============== Implementation of ParagraphTheory =====================
1224 
1226  for (int i = 0; i < models_->size(); i++) {
1227  if ((*models_)[i]->Comparable(model))
1228  return (*models_)[i];
1229  }
1230  auto *m = new ParagraphModel(model);
1231  models_->push_back(m);
1232  models_we_added_.push_back_new(m);
1233  return m;
1234 }
1235 
1236 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1237  for (int i = models_->size() - 1; i >= 0; i--) {
1238  ParagraphModel *m = (*models_)[i];
1239  if (!used_models.contains(m) && models_we_added_.contains(m)) {
1240  models_->remove(i);
1241  models_we_added_.remove(models_we_added_.get_index(m));
1242  delete m;
1243  }
1244  }
1245 }
1246 
1247 // Examine rows[start, end) and try to determine if an existing non-centered
1248 // paragraph model would fit them perfectly. If so, return a pointer to it.
1249 // If not, return nullptr.
1251  const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
1252  for (int m = 0; m < models_->size(); m++) {
1253  const ParagraphModel *model = (*models_)[m];
1254  if (model->justification() != JUSTIFICATION_CENTER &&
1255  RowsFitModel(rows, start, end, model))
1256  return model;
1257  }
1258  return nullptr;
1259 }
1260 
1262  for (int m = 0; m < models_->size(); m++) {
1263  const ParagraphModel *model = (*models_)[m];
1264  if (model->justification() != JUSTIFICATION_CENTER)
1266  }
1267 }
1268 
1269 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1270  for (int i = 0; i < models_->size(); i++) {
1271  if ((*models_)[i] == model)
1272  return i;
1273  }
1274  return -1;
1275 }
1278  int row, const ParagraphModel *model) {
1279  if (!StrongModel(model)) {
1280  tprintf("ValidFirstLine() should only be called with strong models!\n");
1281  }
1282  return StrongModel(model) &&
1283  model->ValidFirstLine(
1284  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1286 }
1287 
1289  int row, const ParagraphModel *model) {
1290  if (!StrongModel(model)) {
1291  tprintf("ValidBodyLine() should only be called with strong models!\n");
1292  }
1293  return StrongModel(model) &&
1294  model->ValidBodyLine(
1295  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1296  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1297 }
1298 
1300  int a, int b, const ParagraphModel *model) {
1301  if (model != kCrownRight && model != kCrownLeft) {
1302  tprintf("CrownCompatible() should only be called with crown models!\n");
1303  return false;
1304  }
1305  RowScratchRegisters &row_a = (*rows)[a];
1306  RowScratchRegisters &row_b = (*rows)[b];
1307  if (model == kCrownRight) {
1308  return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1309  row_b.rindent_ + row_b.rmargin_,
1310  Epsilon(row_a.ri_->average_interword_space));
1311  }
1312  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1313  row_b.lindent_ + row_b.lmargin_,
1314  Epsilon(row_a.ri_->average_interword_space));
1315 }
1316 
1317 
1318 // =============== Implementation of ParagraphModelSmearer ====================
1319 
1322  int row_start, int row_end, ParagraphTheory *theory)
1323  : theory_(theory), rows_(rows), row_start_(row_start),
1324  row_end_(row_end) {
1325  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1326  row_start_ = 0;
1327  row_end_ = 0;
1328  return;
1329  }
1330  SetOfModels no_models;
1331  for (int row = row_start - 1; row <= row_end; row++) {
1332  open_models_.push_back(no_models);
1333  }
1334 }
1336 // see paragraphs_internal.h
1337 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1338  SetOfModels no_models;
1339  if (row_start < row_start_) row_start = row_start_;
1340  if (row_end > row_end_) row_end = row_end_;
1341 
1342  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1343  row++) {
1344  if ((*rows_)[row].ri_->num_words == 0) {
1345  OpenModels(row + 1) = no_models;
1346  } else {
1347  SetOfModels &opened = OpenModels(row);
1348  (*rows_)[row].StartHypotheses(&opened);
1349 
1350  // Which models survive the transition from row to row + 1?
1351  SetOfModels still_open;
1352  for (int m = 0; m < opened.size(); m++) {
1353  if (ValidFirstLine(rows_, row, opened[m]) ||
1354  ValidBodyLine(rows_, row, opened[m])) {
1355  // This is basic filtering; we check likely paragraph starty-ness down
1356  // below in Smear() -- you know, whether the first word would have fit
1357  // and such.
1358  still_open.push_back_new(opened[m]);
1359  }
1360  }
1361  OpenModels(row + 1) = still_open;
1362  }
1363  }
1364 }
1365 
1366 // see paragraphs_internal.h
1367 void ParagraphModelSmearer::Smear() {
1368  CalculateOpenModels(row_start_, row_end_);
1369 
1370  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1371  // we have multiple LT_START hypotheses), see if there's a model that
1372  // was recently used (an "open" model) which might model it well.
1373  for (int i = row_start_; i < row_end_; i++) {
1374  RowScratchRegisters &row = (*rows_)[i];
1375  if (row.ri_->num_words == 0)
1376  continue;
1377 
1378  // Step One:
1379  // Figure out if there are "open" models which are left-alined or
1380  // right-aligned. This is important for determining whether the
1381  // "first" word in a row would fit at the "end" of the previous row.
1382  bool left_align_open = false;
1383  bool right_align_open = false;
1384  for (int m = 0; m < OpenModels(i).size(); m++) {
1385  switch (OpenModels(i)[m]->justification()) {
1386  case JUSTIFICATION_LEFT: left_align_open = true; break;
1387  case JUSTIFICATION_RIGHT: right_align_open = true; break;
1388  default: left_align_open = right_align_open = true;
1389  }
1390  }
1391  // Step Two:
1392  // Use that knowledge to figure out if this row is likely to
1393  // start a paragraph.
1394  bool likely_start;
1395  if (i == 0) {
1396  likely_start = true;
1397  } else {
1398  if ((left_align_open && right_align_open) ||
1399  (!left_align_open && !right_align_open)) {
1400  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1401  JUSTIFICATION_LEFT) ||
1402  LikelyParagraphStart((*rows_)[i - 1], row,
1404  } else if (left_align_open) {
1405  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1407  } else {
1408  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1410  }
1411  }
1412 
1413  // Step Three:
1414  // If this text line seems like an obvious first line of an
1415  // open model, or an obvious continuation of an existing
1416  // modelled paragraph, mark it up.
1417  if (likely_start) {
1418  // Add Start Hypotheses for all Open models that fit.
1419  for (int m = 0; m < OpenModels(i).size(); m++) {
1420  if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1421  row.AddStartLine(OpenModels(i)[m]);
1422  }
1423  }
1424  } else {
1425  // Add relevant body line hypotheses.
1426  SetOfModels last_line_models;
1427  if (i > 0) {
1428  (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1429  } else {
1430  theory_->NonCenteredModels(&last_line_models);
1431  }
1432  for (int m = 0; m < last_line_models.size(); m++) {
1433  const ParagraphModel *model = last_line_models[m];
1434  if (ValidBodyLine(rows_, i, model))
1435  row.AddBodyLine(model);
1436  }
1437  }
1438 
1439  // Step Four:
1440  // If we're still quite unsure about this line, go through all
1441  // models in our theory and see if this row could be the start
1442  // of any of our models.
1443  if (row.GetLineType() == LT_UNKNOWN ||
1444  (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1445  SetOfModels all_models;
1446  theory_->NonCenteredModels(&all_models);
1447  for (int m = 0; m < all_models.size(); m++) {
1448  if (ValidFirstLine(rows_, i, all_models[m])) {
1449  row.AddStartLine(all_models[m]);
1450  }
1451  }
1452  }
1453  // Step Five:
1454  // Since we may have updated the hypotheses about this row, we need
1455  // to recalculate the Open models for the rest of rows[i + 1, row_end)
1456  if (row.GetLineType() != LT_UNKNOWN) {
1457  CalculateOpenModels(i + 1, row_end_);
1458  }
1459  }
1460 }
1461 
1462 // ================ Main Paragraph Detection Algorithm =======================
1463 
1464 // Find out what ParagraphModels are actually used, and discard any
1465 // that are not.
1466 static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
1467  ParagraphTheory *theory) {
1468  SetOfModels used_models;
1469  for (int i = 0; i < rows.size(); i++) {
1470  rows[i].StrongHypotheses(&used_models);
1471  }
1472  theory->DiscardUnusedModels(used_models);
1473 }
1474 
1475 // DowngradeWeakestToCrowns:
1476 // Forget any flush-{left, right} models unless we see two or more
1477 // of them in sequence.
1478 //
1479 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1480 // where the first line and body indent are the same) as having proper Models.
1481 // This is generally dangerous, since if you start imagining that flush-left
1482 // is a typical paragraph model when it is not, it will lead you to chop normal
1483 // indented paragraphs in the middle whenever a sentence happens to start on a
1484 // new line (see "This" above). What to do?
1485 // What we do is to take any paragraph which is flush left and is not
1486 // preceded by another paragraph of the same model and convert it to a "Crown"
1487 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1488 // for later. It means that the paragraph is flush, but it would be desirable
1489 // to mark it as the same model as following text if it fits. This downgrade
1490 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
1491 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1492 // half-of-a-paragraph. and instead we mark it the same as normal body text.
1493 //
1494 // Implementation:
1495 //
1496 // Comb backwards through the row scratch registers, and turn any
1497 // sequences of body lines of equivalent type abutted against the beginning
1498 // or a body or start line of a different type into a crown paragraph.
1499 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1501  int start;
1502  for (int end = rows->size(); end > 0; end = start) {
1503  // Search back for a body line of a unique type.
1504  const ParagraphModel *model = nullptr;
1505  while (end > 0 &&
1506  (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1507  end--;
1508  }
1509  if (end == 0) break;
1510  start = end - 1;
1511  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1512  start--; // walk back to the first line that is not the same body type.
1513  }
1514  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1515  StrongModel(model) &&
1516  NearlyEqual(model->first_indent(), model->body_indent(),
1517  model->tolerance())) {
1518  start--;
1519  }
1520  start++;
1521  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1522  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1523  continue;
1524  if (!StrongModel(model)) {
1525  while (start > 0 &&
1526  CrownCompatible(rows, start - 1, start, model))
1527  start--;
1528  }
1529  if (start == 0 ||
1530  (!StrongModel(model)) ||
1531  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1532  // crownify rows[start, end)
1533  const ParagraphModel *crown_model = model;
1534  if (StrongModel(model)) {
1535  if (model->justification() == JUSTIFICATION_LEFT)
1536  crown_model = kCrownLeft;
1537  else
1538  crown_model = kCrownRight;
1539  }
1540  (*rows)[start].SetUnknown();
1541  (*rows)[start].AddStartLine(crown_model);
1542  for (int row = start + 1; row < end; row++) {
1543  (*rows)[row].SetUnknown();
1544  (*rows)[row].AddBodyLine(crown_model);
1545  }
1546  }
1547  }
1548  DiscardUnusedModels(*rows, theory);
1549 }
1550 
1551 
1552 // Clear all hypotheses about lines [start, end) and reset margins.
1553 //
1554 // The empty space between the left of a row and the block boundary (and
1555 // similarly for the right) is split into two pieces: margin and indent.
1556 // In initial processing, we assume the block is tight and the margin for
1557 // all lines is set to zero. However, if our first pass does not yield
1558 // models for everything, it may be due to an inset paragraph like a
1559 // block-quote. In that case, we make a second pass over that unmarked
1560 // section of the page and reset the "margin" portion of the empty space
1561 // to the common amount of space at the ends of the lines under consid-
1562 // eration. This would be equivalent to percentile set to 0. However,
1563 // sometimes we have a single character sticking out in the right margin
1564 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1565 // really just ignore it as an outlier. To express this, we allow the
1566 // user to specify the percentile (0..100) of indent values to use as
1567 // the common margin for each row in the run of rows[start, end).
1569  GenericVector<RowScratchRegisters> *rows, int start, int end,
1570  int percentile) {
1571  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1572  return;
1573 
1574  int lmin, lmax, rmin, rmax;
1575  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1576  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1577  for (int i = start; i < end; i++) {
1578  RowScratchRegisters &sr = (*rows)[i];
1579  sr.SetUnknown();
1580  if (sr.ri_->num_words == 0)
1581  continue;
1582  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1583  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1584  }
1585  STATS lefts(lmin, lmax + 1);
1586  STATS rights(rmin, rmax + 1);
1587  for (int i = start; i < end; i++) {
1588  RowScratchRegisters &sr = (*rows)[i];
1589  if (sr.ri_->num_words == 0)
1590  continue;
1591  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1592  rights.add(sr.rmargin_ + sr.rindent_, 1);
1593  }
1594  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1595  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1596  for (int i = start; i < end; i++) {
1597  RowScratchRegisters &sr = (*rows)[i];
1598  int ldelta = ignorable_left - sr.lmargin_;
1599  sr.lmargin_ += ldelta;
1600  sr.lindent_ -= ldelta;
1601  int rdelta = ignorable_right - sr.rmargin_;
1602  sr.rmargin_ += rdelta;
1603  sr.rindent_ -= rdelta;
1604  }
1605 }
1606 
1607 // Return the median inter-word space in rows[row_start, row_end).
1609  int row_start, int row_end) {
1610  if (row_end < row_start + 1) return 1;
1611  int word_height = (rows[row_start].ri_->lword_box.height() +
1612  rows[row_end - 1].ri_->lword_box.height()) / 2;
1613  int word_width = (rows[row_start].ri_->lword_box.width() +
1614  rows[row_end - 1].ri_->lword_box.width()) / 2;
1615  STATS spacing_widths(0, 5 + word_width);
1616  for (int i = row_start; i < row_end; i++) {
1617  if (rows[i].ri_->num_words > 1) {
1618  spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1619  }
1620  }
1621  int minimum_reasonable_space = word_height / 3;
1622  if (minimum_reasonable_space < 2)
1623  minimum_reasonable_space = 2;
1624  int median = spacing_widths.median();
1625  return (median > minimum_reasonable_space)
1626  ? median : minimum_reasonable_space;
1627 }
1628 
1629 // Return whether the first word on the after line can fit in the space at
1630 // the end of the before line (knowing which way the text is aligned and read).
1631 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1632  const RowScratchRegisters &after,
1633  tesseract::ParagraphJustification justification) {
1634  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1635  return true;
1636 
1637  if (justification == JUSTIFICATION_UNKNOWN) {
1638  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1639  }
1640  int available_space;
1641  if (justification == JUSTIFICATION_CENTER) {
1642  available_space = before.lindent_ + before.rindent_;
1643  } else {
1644  available_space = before.OffsideIndent(justification);
1645  }
1646  available_space -= before.ri_->average_interword_space;
1647 
1648  if (before.ri_->ltr)
1649  return after.ri_->lword_box.width() < available_space;
1650  return after.ri_->rword_box.width() < available_space;
1651 }
1652 
1653 // Return whether the first word on the after line can fit in the space at
1654 // the end of the before line (not knowing which way the text goes) in a left
1655 // or right alignment.
1656 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1657  const RowScratchRegisters &after) {
1658  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1659  return true;
1660 
1661  int available_space = before.lindent_;
1662  if (before.rindent_ > available_space)
1663  available_space = before.rindent_;
1664  available_space -= before.ri_->average_interword_space;
1665 
1666  if (before.ri_->ltr)
1667  return after.ri_->lword_box.width() < available_space;
1668  return after.ri_->rword_box.width() < available_space;
1669 }
1670 
1671 static bool TextSupportsBreak(const RowScratchRegisters &before,
1672  const RowScratchRegisters &after) {
1673  if (before.ri_->ltr) {
1674  return before.ri_->rword_likely_ends_idea &&
1676  } else {
1677  return before.ri_->lword_likely_ends_idea &&
1679  }
1680 }
1681 
1682 static bool LikelyParagraphStart(const RowScratchRegisters &before,
1683  const RowScratchRegisters &after,
1685  return before.ri_->num_words == 0 ||
1686  (FirstWordWouldHaveFit(before, after, j) &&
1687  TextSupportsBreak(before, after));
1688 }
1689 
1690 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1691 // would fit them as a single paragraph.
1692 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1693 // If the rows given could be a consistent start to a paragraph, set *consistent
1694 // true.
1695 static ParagraphModel InternalParagraphModelByOutline(
1697  int start, int end, int tolerance, bool *consistent) {
1698  int ltr_line_count = 0;
1699  for (int i = start; i < end; i++) {
1700  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1701  }
1702  bool ltr = (ltr_line_count >= (end - start) / 2);
1703 
1704  *consistent = true;
1705  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1706  return ParagraphModel();
1707 
1708  // Ensure the caller only passed us a region with a common rmargin and
1709  // lmargin.
1710  int lmargin = (*rows)[start].lmargin_;
1711  int rmargin = (*rows)[start].rmargin_;
1712  int lmin, lmax, rmin, rmax, cmin, cmax;
1713  lmin = lmax = (*rows)[start + 1].lindent_;
1714  rmin = rmax = (*rows)[start + 1].rindent_;
1715  cmin = cmax = 0;
1716  for (int i = start + 1; i < end; i++) {
1717  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1718  tprintf("Margins don't match! Software error.\n");
1719  *consistent = false;
1720  return ParagraphModel();
1721  }
1722  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1723  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1724  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1725  }
1726  int ldiff = lmax - lmin;
1727  int rdiff = rmax - rmin;
1728  int cdiff = cmax - cmin;
1729  if (rdiff > tolerance && ldiff > tolerance) {
1730  if (cdiff < tolerance * 2) {
1731  if (end - start < 3)
1732  return ParagraphModel();
1733  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1734  }
1735  *consistent = false;
1736  return ParagraphModel();
1737  }
1738  if (end - start < 3) // Don't return a model for two line paras.
1739  return ParagraphModel();
1740 
1741  // These booleans keep us from saying something is aligned left when the body
1742  // left variance is too large.
1743  bool body_admits_left_alignment = ldiff < tolerance;
1744  bool body_admits_right_alignment = rdiff < tolerance;
1745 
1746  ParagraphModel left_model =
1747  ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1748  (lmin + lmax) / 2, tolerance);
1749  ParagraphModel right_model =
1750  ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1751  (rmin + rmax) / 2, tolerance);
1752 
1753  // These booleans keep us from having an indent on the "wrong side" for the
1754  // first line.
1755  bool text_admits_left_alignment = ltr || left_model.is_flush();
1756  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1757 
1758  // At least one of the edges is less than tolerance in variance.
1759  // If the other is obviously ragged, it can't be the one aligned to.
1760  // [Note the last line is included in this raggedness.]
1761  if (tolerance < rdiff) {
1762  if (body_admits_left_alignment && text_admits_left_alignment)
1763  return left_model;
1764  *consistent = false;
1765  return ParagraphModel();
1766  }
1767  if (tolerance < ldiff) {
1768  if (body_admits_right_alignment && text_admits_right_alignment)
1769  return right_model;
1770  *consistent = false;
1771  return ParagraphModel();
1772  }
1773 
1774  // At this point, we know the body text doesn't vary much on either side.
1775 
1776  // If the first line juts out oddly in one direction or the other,
1777  // that likely indicates the side aligned to.
1778  int first_left = (*rows)[start].lindent_;
1779  int first_right = (*rows)[start].rindent_;
1780 
1781  if (ltr && body_admits_left_alignment &&
1782  (first_left < lmin || first_left > lmax))
1783  return left_model;
1784  if (!ltr && body_admits_right_alignment &&
1785  (first_right < rmin || first_right > rmax))
1786  return right_model;
1787 
1788  *consistent = false;
1789  return ParagraphModel();
1790 }
1791 
1792 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1793 // would fit them as a single paragraph. If nothing fits,
1794 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1795 // output if we're debugging.
1796 static ParagraphModel ParagraphModelByOutline(
1797  int debug_level,
1799  int start, int end, int tolerance) {
1800  bool unused_consistent;
1801  ParagraphModel retval = InternalParagraphModelByOutline(
1802  rows, start, end, tolerance, &unused_consistent);
1803  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1804  tprintf("Could not determine a model for this paragraph:\n");
1805  PrintRowRange(*rows, start, end);
1806  }
1807  return retval;
1808 }
1809 
1810 // Do rows[start, end) form a single instance of the given paragraph model?
1812  int start, int end, const ParagraphModel *model) {
1813  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1814  return false;
1815  if (!ValidFirstLine(rows, start, model)) return false;
1816  for (int i = start + 1 ; i < end; i++) {
1817  if (!ValidBodyLine(rows, i, model)) return false;
1818  }
1819  return true;
1820 }
1821 
1822 // Examine rows[row_start, row_end) as an independent section of text,
1823 // and mark rows that are exceptionally clear as start-of-paragraph
1824 // and paragraph-body lines.
1825 //
1826 // We presume that any lines surrounding rows[row_start, row_end) may
1827 // have wildly different paragraph models, so we don't key any data off
1828 // of those lines.
1829 //
1830 // We only take the very strongest signals, as we don't want to get
1831 // confused and marking up centered text, poetry, or source code as
1832 // clearly part of a typical paragraph.
1833 static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
1834  int row_start, int row_end) {
1835  // Record patently obvious body text.
1836  for (int i = row_start + 1; i < row_end; i++) {
1837  const RowScratchRegisters &prev = (*rows)[i - 1];
1838  RowScratchRegisters &curr = (*rows)[i];
1839  tesseract::ParagraphJustification typical_justification =
1841  if (!curr.ri_->rword_likely_starts_idea &&
1842  !curr.ri_->lword_likely_starts_idea &&
1843  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1844  curr.SetBodyLine();
1845  }
1846  }
1847 
1848  // Record patently obvious start paragraph lines.
1849  //
1850  // It's an extremely good signal of the start of a paragraph that
1851  // the first word would have fit on the end of the previous line.
1852  // However, applying just that signal would have us mark random
1853  // start lines of lineated text (poetry and source code) and some
1854  // centered headings as paragraph start lines. Therefore, we use
1855  // a second qualification for a paragraph start: Not only should
1856  // the first word of this line have fit on the previous line,
1857  // but also, this line should go full to the right of the block,
1858  // disallowing a subsequent word from having fit on this line.
1859 
1860  // First row:
1861  {
1862  RowScratchRegisters &curr = (*rows)[row_start];
1863  RowScratchRegisters &next = (*rows)[row_start + 1];
1866  if (curr.GetLineType() == LT_UNKNOWN &&
1867  !FirstWordWouldHaveFit(curr, next, j) &&
1868  (curr.ri_->lword_likely_starts_idea ||
1869  curr.ri_->rword_likely_starts_idea)) {
1870  curr.SetStartLine();
1871  }
1872  }
1873  // Middle rows
1874  for (int i = row_start + 1; i < row_end - 1; i++) {
1875  RowScratchRegisters &prev = (*rows)[i - 1];
1876  RowScratchRegisters &curr = (*rows)[i];
1877  RowScratchRegisters &next = (*rows)[i + 1];
1880  if (curr.GetLineType() == LT_UNKNOWN &&
1881  !FirstWordWouldHaveFit(curr, next, j) &&
1882  LikelyParagraphStart(prev, curr, j)) {
1883  curr.SetStartLine();
1884  }
1885  }
1886  // Last row
1887  { // the short circuit at the top means we have at least two lines.
1888  RowScratchRegisters &prev = (*rows)[row_end - 2];
1889  RowScratchRegisters &curr = (*rows)[row_end - 1];
1892  if (curr.GetLineType() == LT_UNKNOWN &&
1893  !FirstWordWouldHaveFit(curr, curr, j) &&
1894  LikelyParagraphStart(prev, curr, j)) {
1895  curr.SetStartLine();
1896  }
1897  }
1898 }
1899 
1900 // Look for sequences of a start line followed by some body lines in
1901 // rows[row_start, row_end) and create ParagraphModels for them if
1902 // they seem coherent.
1903 static void ModelStrongEvidence(int debug_level,
1905  int row_start, int row_end,
1906  bool allow_flush_models,
1907  ParagraphTheory *theory) {
1908  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1909  return;
1910 
1911  int start = row_start;
1912  while (start < row_end) {
1913  while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1914  start++;
1915  if (start >= row_end - 1)
1916  break;
1917 
1918  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1919  int end = start;
1920  ParagraphModel last_model;
1921  bool next_consistent;
1922  do {
1923  ++end;
1924  // rows[row, end) was consistent.
1925  // If rows[row, end + 1) is not consistent,
1926  // just model rows[row, end)
1927  if (end < row_end - 1) {
1928  RowScratchRegisters &next = (*rows)[end];
1929  LineType lt = next.GetLineType();
1930  next_consistent = lt == LT_BODY ||
1931  (lt == LT_UNKNOWN &&
1932  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1933  } else {
1934  next_consistent = false;
1935  }
1936  if (next_consistent) {
1937  ParagraphModel next_model = InternalParagraphModelByOutline(
1938  rows, start, end + 1, tolerance, &next_consistent);
1939  if (((*rows)[start].ri_->ltr &&
1940  last_model.justification() == JUSTIFICATION_LEFT &&
1941  next_model.justification() != JUSTIFICATION_LEFT) ||
1942  (!(*rows)[start].ri_->ltr &&
1943  last_model.justification() == JUSTIFICATION_RIGHT &&
1944  next_model.justification() != JUSTIFICATION_RIGHT)) {
1945  next_consistent = false;
1946  }
1947  last_model = next_model;
1948  } else {
1949  next_consistent = false;
1950  }
1951  } while (next_consistent && end < row_end);
1952  // At this point, rows[start, end) looked like it could have been a
1953  // single paragraph. If we can make a good ParagraphModel for it,
1954  // do so and mark this sequence with that model.
1955  if (end > start + 1) {
1956  // emit a new paragraph if we have more than one line.
1957  const ParagraphModel *model = nullptr;
1958  ParagraphModel new_model = ParagraphModelByOutline(
1959  debug_level, rows, start, end,
1960  Epsilon(InterwordSpace(*rows, start, end)));
1961  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1962  // couldn't create a good model, oh well.
1963  } else if (new_model.is_flush()) {
1964  if (end == start + 2) {
1965  // It's very likely we just got two paragraph starts in a row.
1966  end = start + 1;
1967  } else if (start == row_start) {
1968  // Mark this as a Crown.
1969  if (new_model.justification() == JUSTIFICATION_LEFT) {
1970  model = kCrownLeft;
1971  } else {
1972  model = kCrownRight;
1973  }
1974  } else if (allow_flush_models) {
1975  model = theory->AddModel(new_model);
1976  }
1977  } else {
1978  model = theory->AddModel(new_model);
1979  }
1980  if (model) {
1981  (*rows)[start].AddStartLine(model);
1982  for (int i = start + 1; i < end; i++) {
1983  (*rows)[i].AddBodyLine(model);
1984  }
1985  }
1986  }
1987  start = end;
1988  }
1989 }
1990 
1991 // We examine rows[row_start, row_end) and do the following:
1992 // (1) Clear all existing hypotheses for the rows being considered.
1993 // (2) Mark up any rows as exceptionally likely to be paragraph starts
1994 // or paragraph body lines as such using both geometric and textual
1995 // clues.
1996 // (3) Form models for any sequence of start + continuation lines.
1997 // (4) Smear the paragraph models to cover surrounding text.
1998 static void StrongEvidenceClassify(int debug_level,
2000  int row_start, int row_end,
2001  ParagraphTheory *theory) {
2002  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2003  return;
2004 
2005  if (debug_level > 1) {
2006  tprintf("#############################################\n");
2007  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2008  tprintf("#############################################\n");
2009  }
2010 
2011  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2012  MarkStrongEvidence(rows, row_start, row_end);
2013 
2014  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2015 
2016  // Create paragraph models.
2017  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2018 
2019  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2020 
2021  // At this point, some rows are marked up as paragraphs with model numbers,
2022  // and some rows are marked up as either LT_START or LT_BODY. Now let's
2023  // smear any good paragraph hypotheses forward and backward.
2024  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2025  smearer.Smear();
2026 }
2027 
2028 static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
2029  int row_start, int row_end,
2030  ParagraphTheory *theory) {
2031  for (int i = row_start + 1; i < row_end - 1; i++) {
2032  if ((*rows)[i - 1].ri_->has_leaders &&
2033  (*rows)[i].ri_->has_leaders &&
2034  (*rows)[i + 1].ri_->has_leaders) {
2035  const ParagraphModel *model = theory->AddModel(
2036  ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2037  (*rows)[i].AddStartLine(model);
2038  }
2039  }
2040 }
2041 
2042 // Collect sequences of unique hypotheses in row registers and create proper
2043 // paragraphs for them, referencing the paragraphs in row_owners.
2044 static void ConvertHypothesizedModelRunsToParagraphs(
2045  int debug_level,
2047  GenericVector<PARA *> *row_owners,
2048  ParagraphTheory *theory) {
2049  int end = rows.size();
2050  int start;
2051  for (; end > 0; end = start) {
2052  start = end - 1;
2053  const ParagraphModel *model = nullptr;
2054  // TODO(eger): Be smarter about dealing with multiple hypotheses.
2055  bool single_line_paragraph = false;
2056  SetOfModels models;
2057  rows[start].NonNullHypotheses(&models);
2058  if (!models.empty()) {
2059  model = models[0];
2060  if (rows[start].GetLineType(model) != LT_BODY)
2061  single_line_paragraph = true;
2062  }
2063  if (model && !single_line_paragraph) {
2064  // walk back looking for more body lines and then a start line.
2065  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2066  // do nothing
2067  }
2068  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2069  model = nullptr;
2070  }
2071  }
2072  if (model == nullptr) {
2073  continue;
2074  }
2075  // rows[start, end) should be a paragraph.
2076  PARA *p = new PARA();
2077  if (model == kCrownLeft || model == kCrownRight) {
2079  // Crown paragraph.
2080  // If we can find an existing ParagraphModel that fits, use it,
2081  // else create a new one.
2082  for (int row = end; row < rows.size(); row++) {
2083  if ((*row_owners)[row] &&
2084  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2085  (start == 0 ||
2086  ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2087  model = (*row_owners)[row]->model;
2088  break;
2089  }
2090  }
2091  if (model == kCrownLeft) {
2092  // No subsequent model fits, so cons one up.
2093  model = theory->AddModel(ParagraphModel(
2094  JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2095  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2096  } else if (model == kCrownRight) {
2097  // No subsequent model fits, so cons one up.
2098  model = theory->AddModel(ParagraphModel(
2099  JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
2100  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2101  }
2102  }
2103  rows[start].SetUnknown();
2104  rows[start].AddStartLine(model);
2105  for (int i = start + 1; i < end; i++) {
2106  rows[i].SetUnknown();
2107  rows[i].AddBodyLine(model);
2108  }
2109  p->model = model;
2110  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2111  p->is_list_item =
2113  ? rows[start].ri_->rword_indicates_list_item
2114  : rows[start].ri_->lword_indicates_list_item;
2115  for (int row = start; row < end; row++) {
2116  if ((*row_owners)[row] != nullptr) {
2117  tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2118  "more than once!\n");
2119  delete (*row_owners)[row];
2120  }
2121  (*row_owners)[row] = p;
2122  }
2123  }
2124 }
2125 
2126 struct Interval {
2127  Interval() : begin(0), end(0) {}
2128  Interval(int b, int e) : begin(b), end(e) {}
2129 
2130  int begin;
2131  int end;
2132 };
2133 
2134 // Return whether rows[row] appears to be stranded, meaning that the evidence
2135 // for this row is very weak due to context. For instance, two lines of source
2136 // code may happen to be indented at the same tab vector as body text starts,
2137 // leading us to think they are two start-of-paragraph lines. This is not
2138 // optimal. However, we also don't want to mark a sequence of short dialog
2139 // as "weak," so our heuristic is:
2140 // (1) If a line is surrounded by lines of unknown type, it's weak.
2141 // (2) If two lines in a row are start lines for a given paragraph type, but
2142 // after that the same paragraph type does not continue, they're weak.
2143 static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows,
2144  int row) {
2145  SetOfModels row_models;
2146  rows[row].StrongHypotheses(&row_models);
2147 
2148  for (int m = 0; m < row_models.size(); m++) {
2149  bool all_starts = rows[row].GetLineType();
2150  int run_length = 1;
2151  bool continues = true;
2152  for (int i = row - 1; i >= 0 && continues; i--) {
2153  SetOfModels models;
2154  rows[i].NonNullHypotheses(&models);
2155  switch (rows[i].GetLineType(row_models[m])) {
2156  case LT_START: run_length++; break;
2157  case LT_MULTIPLE: // explicit fall-through
2158  case LT_BODY: run_length++; all_starts = false; break;
2159  case LT_UNKNOWN: // explicit fall-through
2160  default: continues = false;
2161  }
2162  }
2163  continues = true;
2164  for (int i = row + 1; i < rows.size() && continues; i++) {
2165  SetOfModels models;
2166  rows[i].NonNullHypotheses(&models);
2167  switch (rows[i].GetLineType(row_models[m])) {
2168  case LT_START: run_length++; break;
2169  case LT_MULTIPLE: // explicit fall-through
2170  case LT_BODY: run_length++; all_starts = false; break;
2171  case LT_UNKNOWN: // explicit fall-through
2172  default: continues = false;
2173  }
2174  }
2175  if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2176  }
2177  return true;
2178 }
2179 
2180 // Go through rows[row_start, row_end) and gather up sequences that need better
2181 // classification.
2182 // + Sequences of non-empty rows without hypotheses.
2183 // + Crown paragraphs not immediately followed by a strongly modeled line.
2184 // + Single line paragraphs surrounded by text that doesn't match the
2185 // model.
2186 static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
2187  GenericVector<Interval> *to_fix,
2188  int row_start, int row_end) {
2189  to_fix->clear();
2190  for (int i = row_start; i < row_end; i++) {
2191  bool needs_fixing = false;
2192 
2193  SetOfModels models;
2194  SetOfModels models_w_crowns;
2195  rows[i].StrongHypotheses(&models);
2196  rows[i].NonNullHypotheses(&models_w_crowns);
2197  if (models.empty() && !models_w_crowns.empty()) {
2198  // Crown paragraph. Is it followed by a modeled line?
2199  for (int end = i + 1; end < rows.size(); end++) {
2200  SetOfModels end_models;
2201  SetOfModels strong_end_models;
2202  rows[end].NonNullHypotheses(&end_models);
2203  rows[end].StrongHypotheses(&strong_end_models);
2204  if (end_models.empty()) {
2205  needs_fixing = true;
2206  break;
2207  } else if (!strong_end_models.empty()) {
2208  needs_fixing = false;
2209  break;
2210  }
2211  }
2212  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2213  // No models at all.
2214  needs_fixing = true;
2215  }
2216 
2217  if (!needs_fixing && !models.empty()) {
2218  needs_fixing = RowIsStranded(rows, i);
2219  }
2220 
2221  if (needs_fixing) {
2222  if (!to_fix->empty() && to_fix->back().end == i - 1)
2223  to_fix->back().end = i;
2224  else
2225  to_fix->push_back(Interval(i, i));
2226  }
2227  }
2228  // Convert inclusive intervals to half-open intervals.
2229  for (int i = 0; i < to_fix->size(); i++) {
2230  (*to_fix)[i].end = (*to_fix)[i].end + 1;
2231  }
2232 }
2233 
2234 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2235 // normalize each row_owner to point to an actual PARA, and output the
2236 // paragraphs in order onto paragraphs.
2238  GenericVector<PARA *> *row_owners,
2239  PARA_LIST *paragraphs) {
2240  GenericVector<PARA *> &rows = *row_owners;
2241  paragraphs->clear();
2242  PARA_IT out(paragraphs);
2243  PARA *formerly_null = nullptr;
2244  for (int i = 0; i < rows.size(); i++) {
2245  if (rows[i] == nullptr) {
2246  if (i == 0 || rows[i - 1] != formerly_null) {
2247  rows[i] = formerly_null = new PARA();
2248  } else {
2249  rows[i] = formerly_null;
2250  continue;
2251  }
2252  } else if (i > 0 && rows[i - 1] == rows[i]) {
2253  continue;
2254  }
2255  out.add_after_then_move(rows[i]);
2256  }
2257 }
2258 
2259 // Main entry point for Paragraph Detection Algorithm.
2260 //
2261 // Given a set of equally spaced textlines (described by row_infos),
2262 // Split them into paragraphs.
2263 //
2264 // Output:
2265 // row_owners - one pointer for each row, to the paragraph it belongs to.
2266 // paragraphs - this is the actual list of PARA objects.
2267 // models - the list of paragraph models referenced by the PARA objects.
2268 // caller is responsible for deleting the models.
2269 void DetectParagraphs(int debug_level,
2270  GenericVector<RowInfo> *row_infos,
2271  GenericVector<PARA *> *row_owners,
2272  PARA_LIST *paragraphs,
2275  ParagraphTheory theory(models);
2276 
2277  // Initialize row_owners to be a bunch of nullptr pointers.
2278  row_owners->init_to_size(row_infos->size(), nullptr);
2279 
2280  // Set up row scratch registers for the main algorithm.
2281  rows.init_to_size(row_infos->size(), RowScratchRegisters());
2282  for (int i = 0; i < row_infos->size(); i++) {
2283  rows[i].Init((*row_infos)[i]);
2284  }
2285 
2286  // Pass 1:
2287  // Detect sequences of lines that all contain leader dots (.....)
2288  // These are likely Tables of Contents. If there are three text lines in
2289  // a row with leader dots, it's pretty safe to say the middle one should
2290  // be a paragraph of its own.
2291  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2292 
2293  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2294 
2295  GenericVector<Interval> leftovers;
2296  LeftoverSegments(rows, &leftovers, 0, rows.size());
2297  for (int i = 0; i < leftovers.size(); i++) {
2298  // Pass 2a:
2299  // Find any strongly evidenced start-of-paragraph lines. If they're
2300  // followed by two lines that look like body lines, make a paragraph
2301  // model for that and see if that model applies throughout the text
2302  // (that is, "smear" it).
2303  StrongEvidenceClassify(debug_level, &rows,
2304  leftovers[i].begin, leftovers[i].end, &theory);
2305 
2306  // Pass 2b:
2307  // If we had any luck in pass 2a, we got part of the page and didn't
2308  // know how to classify a few runs of rows. Take the segments that
2309  // didn't find a model and reprocess them individually.
2310  GenericVector<Interval> leftovers2;
2311  LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2312  bool pass2a_was_useful = leftovers2.size() > 1 ||
2313  (leftovers2.size() == 1 &&
2314  (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2315  if (pass2a_was_useful) {
2316  for (int j = 0; j < leftovers2.size(); j++) {
2317  StrongEvidenceClassify(debug_level, &rows,
2318  leftovers2[j].begin, leftovers2[j].end,
2319  &theory);
2320  }
2321  }
2322  }
2323 
2324  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2325 
2326  // Pass 3:
2327  // These are the dregs for which we didn't have enough strong textual
2328  // and geometric clues to form matching models for. Let's see if
2329  // the geometric clues are simple enough that we could just use those.
2330  LeftoverSegments(rows, &leftovers, 0, rows.size());
2331  for (int i = 0; i < leftovers.size(); i++) {
2332  GeometricClassify(debug_level, &rows,
2333  leftovers[i].begin, leftovers[i].end, &theory);
2334  }
2335 
2336  // Undo any flush models for which there's little evidence.
2337  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2338 
2339  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2340 
2341  // Pass 4:
2342  // Take everything that's still not marked up well and clear all markings.
2343  LeftoverSegments(rows, &leftovers, 0, rows.size());
2344  for (int i = 0; i < leftovers.size(); i++) {
2345  for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2346  rows[j].SetUnknown();
2347  }
2348  }
2349 
2350  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2351 
2352  // Convert all of the unique hypothesis runs to PARAs.
2353  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2354  &theory);
2355 
2356  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2357 
2358  // Finally, clean up any dangling nullptr row paragraph parents.
2359  CanonicalizeDetectionResults(row_owners, paragraphs);
2360 }
2361 
2362 // ============ Code interfacing with the rest of Tesseract ==================
2363 
2364 static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
2365  RowInfo *info) {
2366  // Set up text, lword_text, and rword_text (mostly for debug printing).
2367  STRING fake_text;
2368  PageIterator pit(static_cast<const PageIterator&>(it));
2369  bool first_word = true;
2370  if (!pit.Empty(RIL_WORD)) {
2371  do {
2372  fake_text += "x";
2373  if (first_word) info->lword_text += "x";
2374  info->rword_text += "x";
2375  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2376  !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2377  fake_text += " ";
2378  info->rword_text = "";
2379  first_word = false;
2380  }
2381  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2382  pit.Next(RIL_SYMBOL));
2383  }
2384  if (fake_text.size() == 0) return;
2385 
2386  int lspaces = info->pix_ldistance / info->average_interword_space;
2387  for (int i = 0; i < lspaces; i++) {
2388  info->text += ' ';
2389  }
2390  info->text += fake_text;
2391 
2392  // Set up lword_box, rword_box, and num_words.
2393  PAGE_RES_IT page_res_it = *it.PageResIt();
2394  WERD_RES *word_res = page_res_it.restart_row();
2395  ROW_RES *this_row = page_res_it.row();
2396 
2397  WERD_RES *lword = nullptr;
2398  WERD_RES *rword = nullptr;
2399  info->num_words = 0;
2400  do {
2401  if (word_res) {
2402  if (!lword) lword = word_res;
2403  if (rword != word_res) info->num_words++;
2404  rword = word_res;
2405  }
2406  word_res = page_res_it.forward();
2407  } while (page_res_it.row() == this_row);
2408 
2409  if (lword) info->lword_box = lword->word->bounding_box();
2410  if (rword) info->rword_box = rword->word->bounding_box();
2411 }
2412 
2413 
2414 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2415 // detector RowInfo with all relevant information from the row.
2416 static void InitializeRowInfo(bool after_recognition,
2417  const MutableIterator &it, RowInfo *info) {
2418  if (it.PageResIt()->row() != nullptr) {
2419  ROW *row = it.PageResIt()->row()->row;
2420  info->pix_ldistance = row->lmargin();
2421  info->pix_rdistance = row->rmargin();
2422  info->average_interword_space =
2423  row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2424  info->pix_xheight = row->x_height();
2425  info->has_leaders = false;
2426  info->has_drop_cap = row->has_drop_cap();
2427  info->ltr = true; // set below depending on word scripts
2428  } else {
2429  info->pix_ldistance = info->pix_rdistance = 0;
2430  info->average_interword_space = 1;
2431  info->pix_xheight = 1.0;
2432  info->has_leaders = false;
2433  info->has_drop_cap = false;
2434  info->ltr = true;
2435  }
2436 
2437  info->num_words = 0;
2438  info->lword_indicates_list_item = false;
2439  info->lword_likely_starts_idea = false;
2440  info->lword_likely_ends_idea = false;
2441  info->rword_indicates_list_item = false;
2442  info->rword_likely_starts_idea = false;
2443  info->rword_likely_ends_idea = false;
2444  info->has_leaders = false;
2445  info->ltr = true;
2446 
2447  if (!after_recognition) {
2448  InitializeTextAndBoxesPreRecognition(it, info);
2449  return;
2450  }
2451  info->text = "";
2452  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2453  int trailing_ws_idx = strlen(text.get()); // strip trailing space
2454  while (trailing_ws_idx > 0 &&
2455  // isspace() only takes ASCII
2456  isascii(text[trailing_ws_idx - 1]) &&
2457  isspace(text[trailing_ws_idx - 1]))
2458  trailing_ws_idx--;
2459  if (trailing_ws_idx > 0) {
2460  int lspaces = info->pix_ldistance / info->average_interword_space;
2461  for (int i = 0; i < lspaces; i++)
2462  info->text += ' ';
2463  for (int i = 0; i < trailing_ws_idx; i++)
2464  info->text += text[i];
2465  }
2466 
2467  if (info->text.size() == 0) {
2468  return;
2469  }
2470 
2471  PAGE_RES_IT page_res_it = *it.PageResIt();
2473  WERD_RES *word_res = page_res_it.restart_row();
2474  ROW_RES *this_row = page_res_it.row();
2475  int num_leaders = 0;
2476  int ltr = 0;
2477  int rtl = 0;
2478  do {
2479  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2480  werds.push_back(word_res);
2481  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2482  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2483  if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2484  }
2485  word_res = page_res_it.forward();
2486  } while (page_res_it.row() == this_row);
2487  info->ltr = ltr >= rtl;
2488  info->has_leaders = num_leaders > 3;
2489  info->num_words = werds.size();
2490  if (!werds.empty()) {
2491  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2492  info->lword_text = lword->best_choice->unichar_string().c_str();
2493  info->rword_text = rword->best_choice->unichar_string().c_str();
2494  info->lword_box = lword->word->bounding_box();
2495  info->rword_box = rword->word->bounding_box();
2496  LeftWordAttributes(lword->uch_set, lword->best_choice,
2497  info->lword_text,
2498  &info->lword_indicates_list_item,
2499  &info->lword_likely_starts_idea,
2500  &info->lword_likely_ends_idea);
2501  RightWordAttributes(rword->uch_set, rword->best_choice,
2502  info->rword_text,
2503  &info->rword_indicates_list_item,
2504  &info->rword_likely_starts_idea,
2505  &info->rword_likely_ends_idea);
2506  }
2507 }
2508 
2509 // This is called after rows have been identified and words are recognized.
2510 // Much of this could be implemented before word recognition, but text helps
2511 // to identify bulleted lists and gives good signals for sentence boundaries.
2512 void DetectParagraphs(int debug_level,
2513  bool after_text_recognition,
2514  const MutableIterator *block_start,
2516  // Clear out any preconceived notions.
2517  if (block_start->Empty(RIL_TEXTLINE)) {
2518  return;
2519  }
2520  BLOCK *block = block_start->PageResIt()->block()->block;
2521  block->para_list()->clear();
2522  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2523 
2524  // Convert the Tesseract structures to RowInfos
2525  // for the paragraph detection algorithm.
2526  MutableIterator row(*block_start);
2527  if (row.Empty(RIL_TEXTLINE))
2528  return; // end of input already.
2529 
2530  GenericVector<RowInfo> row_infos;
2531  do {
2532  if (!row.PageResIt()->row())
2533  continue; // empty row.
2534  row.PageResIt()->row()->row->set_para(nullptr);
2535  row_infos.push_back(RowInfo());
2536  RowInfo &ri = row_infos.back();
2537  InitializeRowInfo(after_text_recognition, row, &ri);
2538  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2539  row.Next(RIL_TEXTLINE));
2540 
2541  // If we're called before text recognition, we might not have
2542  // tight block bounding boxes, so trim by the minimum on each side.
2543  if (!row_infos.empty()) {
2544  int min_lmargin = row_infos[0].pix_ldistance;
2545  int min_rmargin = row_infos[0].pix_rdistance;
2546  for (int i = 1; i < row_infos.size(); i++) {
2547  if (row_infos[i].pix_ldistance < min_lmargin)
2548  min_lmargin = row_infos[i].pix_ldistance;
2549  if (row_infos[i].pix_rdistance < min_rmargin)
2550  min_rmargin = row_infos[i].pix_rdistance;
2551  }
2552  if (min_lmargin > 0 || min_rmargin > 0) {
2553  for (int i = 0; i < row_infos.size(); i++) {
2554  row_infos[i].pix_ldistance -= min_lmargin;
2555  row_infos[i].pix_rdistance -= min_rmargin;
2556  }
2557  }
2558  }
2559 
2560  // Run the paragraph detection algorithm.
2561  GenericVector<PARA *> row_owners;
2562  GenericVector<PARA *> the_paragraphs;
2563  if (!is_image_block) {
2564  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2565  models);
2566  } else {
2567  row_owners.init_to_size(row_infos.size(), nullptr);
2568  CanonicalizeDetectionResults(&row_owners, block->para_list());
2569  }
2570 
2571  // Now stitch in the row_owners into the rows.
2572  row = *block_start;
2573  for (int i = 0; i < row_owners.size(); i++) {
2574  while (!row.PageResIt()->row())
2575  row.Next(RIL_TEXTLINE);
2576  row.PageResIt()->row()->row->set_para(row_owners[i]);
2577  row.Next(RIL_TEXTLINE);
2578  }
2579 }
2580 
2581 } // namespace
tesseract::RowScratchRegisters::lindent_
int lindent_
Definition: paragraphs_internal.h:180
ParagraphModel::body_indent
int body_indent() const
Definition: ocrpara.h:169
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
GenericVector::remove
void remove(int index)
Definition: genericvector.h:765
tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
Definition: paragraphs.cpp:1671
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
tesseract::RowScratchRegisters::ri_
const RowInfo * ri_
Definition: paragraphs_internal.h:172
strngs.h
tesseract::ParagraphModelSmearer::ParagraphModelSmearer
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:1335
pdblock.h
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
tesseract::RowScratchRegisters::NonNullHypotheses
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:639
tesseract::UnicodeFor
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:303
pageres.h
JUSTIFICATION_UNKNOWN
Definition: capi.h:132
tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition: paragraphs.h:72
tesseract::UnicodeSpanSkipper::SkipDigits
int SkipDigits(int pos)
Definition: paragraphs.cpp:336
tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
Definition: paragraphs.cpp:1646
tesseract::ParagraphTheory::models
GenericVector< ParagraphModel * > & models()
Definition: paragraphs_internal.h:197
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
host.h
tesseract::ValidBodyLine
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1303
tesseract::RowScratchRegisters::AppendDebugInfo
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
Definition: paragraphs.cpp:510
ParagraphModel::ValidBodyLine
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:62
tesseract::JUSTIFICATION_RIGHT
Definition: publictypes.h:252
W_REP_CHAR
repeated character
Definition: werd.h:52
tesseract::SimpleClusterer::size
int size() const
Definition: paragraphs.cpp:685
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
ROW::rmargin
int16_t rmargin() const
Definition: ocrrow.h:103
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
RIL_WORD
Definition: capi.h:104
NearlyEqual
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:36
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
tesseract::GeometricClassifierState::row_end
int row_end
Definition: paragraphs.cpp:960
tesseract::GeometricClassifierState::debug_level
int debug_level
Definition: paragraphs.cpp:954
PARA::is_list_item
bool is_list_item
Definition: ocrpara.h:38
tesseract::GeometricClassifierState::body_indent
int body_indent
Definition: paragraphs.cpp:978
tesseract::GeometricClassifierState::GeometricClassifierState
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
Definition: paragraphs.cpp:882
tesseract::RowScratchRegisters::StrongHypotheses
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:632
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
tesseract::LeftWordAttributes
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:423
tesseract::GeometricClassifierState::IsFullRow
bool IsFullRow(int i) const
Definition: paragraphs.cpp:925
tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:248
tesseract::UnicodeSpanSkipper::SkipPunc
int SkipPunc(int pos)
Definition: paragraphs.cpp:331
tesseract::Cluster::count
int count
Definition: paragraphs.cpp:677
STRING
Definition: strngs.h:45
PARA::has_drop_cap
bool has_drop_cap
Definition: ocrpara.h:46
polyblk.h
tesseract::RecomputeMarginsAndClearHypotheses
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
Definition: paragraphs.cpp:1583
tesseract::RowScratchRegisters::rmargin_
int rmargin_
Definition: paragraphs_internal.h:182
tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition: paragraphs.h:73
WERD_RES
Definition: pageres.h:160
tesseract::SimpleClusterer::SimpleClusterer
SimpleClusterer(int max_cluster_width)
Definition: paragraphs.cpp:682
GenericVector::contains
bool contains(const T &object) const
Definition: genericvector.h:793
tesseract::CanonicalizeDetectionResults
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
Definition: paragraphs.cpp:2252
tesseract::GeometricClassifierState::rows
GenericVector< RowScratchRegisters > * rows
Definition: paragraphs.cpp:958
tesseract::RowScratchRegisters::SetBodyLine
void SetBodyLine()
Definition: paragraphs.cpp:601
tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models)
Definition: paragraphs.cpp:2527
JUSTIFICATION_LEFT
Definition: capi.h:133
tesseract::RowScratchRegisters::UniqueBodyHypothesis
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:652
rect.h
ParagraphModel::ValidFirstLine
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:45
tesseract::kRLE
const char *const kRLE
Right-to-Left Embedding.
Definition: unicodes.cpp:40
tesseract::GeometricClassifierState::margin
int margin
Definition: paragraphs.cpp:976
tesseract::RightWordAttributes
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:470
tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition: paragraphs.h:76
tesseract::ParagraphTheory
Definition: paragraphs_internal.h:191
RIL_SYMBOL
Definition: capi.h:105
ParagraphModel
Definition: ocrpara.h:114
GenericVector::back
T & back() const
Definition: genericvector.h:728
ROW::lmargin
int16_t lmargin() const
Definition: ocrrow.h:100
tesseract::RowInfo
Definition: paragraphs.h:40
tesseract::RowScratchRegisters::lmargin_
int lmargin_
Definition: paragraphs_internal.h:179
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
ratngs.h
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
ParagraphModel::first_indent
int first_indent() const
Definition: ocrpara.h:168
werd.h
statistc.h
mutableiterator.h
genericvector.h
STRING::size
int32_t size() const
Definition: strngs.h:68
WERD_RES::AnyLtrCharsInWord
bool AnyLtrCharsInWord() const
Definition: pageres.h:403
tesseract::JUSTIFICATION_LEFT
Definition: publictypes.h:250
tesseract::CrownCompatible
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
Definition: paragraphs.cpp:1314
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition: paragraphs.h:77
ParagraphModel::tolerance
int tolerance() const
Definition: ocrpara.h:170
tesseract::GeometricClassifierState
Definition: paragraphs.cpp:881
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1623
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
ROW::space
int32_t space() const
Definition: ocrrow.h:78
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
tesseract::JUSTIFICATION_UNKNOWN
Definition: publictypes.h:249
tesseract::RowScratchRegisters::AddBodyLine
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:618
tesseract::UnicodeSpanSkipper::SkipAlpha
int SkipAlpha(int pos)
Definition: paragraphs.cpp:352
tesseract::ParagraphTheory::AddModel
const ParagraphModel * AddModel(const ParagraphModel &model)
Definition: paragraphs.cpp:1240
publictypes.h
tesseract::kPDF
const char *const kPDF
Pop Directional Formatting.
Definition: unicodes.cpp:41
tesseract::GeometricClassifierState::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(int row_a, int row_b)
Definition: paragraphs.cpp:936
tesseract::RowScratchRegisters::rindent_
int rindent_
Definition: paragraphs_internal.h:181
tesseract::GeometricClassifierState::AssumeRightJustification
void AssumeRightJustification()
Definition: paragraphs.cpp:902
tesseract::ParagraphTheory::Fits
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
Definition: paragraphs.cpp:1265
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::RowsFitModel
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
Definition: paragraphs.cpp:1826
UNICHARSET
Definition: unicharset.h:145
tesseract::RowScratchRegisters::Init
void Init(const RowInfo &row)
Definition: paragraphs.cpp:541
WERD_RES::AnyRtlCharsInWord
bool AnyRtlCharsInWord() const
Definition: pageres.h:387
tesseract::ParagraphTheory::DiscardUnusedModels
void DiscardUnusedModels(const SetOfModels &used_models)
Definition: paragraphs.cpp:1251
tesseract::GeometricClassifierState::just
tesseract::ParagraphJustification just
Definition: paragraphs.cpp:975
tesseract::GeometricClassifierState::row_start
int row_start
Definition: paragraphs.cpp:959
pageiterator.h
tesseract::AsciiLikelyListItem
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:296
BLOCK::para_list
PARA_LIST * para_list()
Definition: ocrblock.h:123
GenericVector::get_index
int get_index(const T &object) const
Definition: genericvector.h:781
helpers.h
tesseract::GeometricClassifierState::Fail
void Fail(int min_debug_level, const char *why) const
Definition: paragraphs.cpp:943
ROW::has_drop_cap
bool has_drop_cap() const
Definition: ocrrow.h:110
tesseract
Definition: baseapi.h:65
tesseract::UnicodeSpanSkipper::SkipRomans
int SkipRomans(int pos)
Definition: paragraphs.cpp:342
tesseract::SimpleClusterer::Add
void Add(int value)
Definition: paragraphs.cpp:684
ocrpara.h
unicodes.h
RIL_TEXTLINE
Definition: capi.h:103
tesseract::RowInfo::num_words
int num_words
Definition: paragraphs.h:54
tesseract::RowScratchRegisters
Definition: paragraphs_internal.h:102
tesseract::kCrownLeft
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:69
STATS
Definition: statistc.h:30
GenericVectorEqEq
Definition: genericvector.h:640
tprintf.h
tesseract::GeometricClassifierState::eop_threshold
int eop_threshold
Definition: paragraphs.cpp:981
tesseract::RowScratchRegisters::AppendDebugHeaderFields
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
Definition: paragraphs.cpp:504
tesseract::LT_UNKNOWN
Definition: paragraphs_internal.h:52
paragraphs.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::RowScratchRegisters::DiscardNonMatchingHypotheses
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:659
JUSTIFICATION_CENTER
Definition: capi.h:134
GenericVector
Definition: baseapi.h:40
PAGE_RES_IT
Definition: pageres.h:668
tesseract::SimpleClusterer::GetClusters
void GetClusters(GenericVector< Cluster > *clusters)
Definition: paragraphs.cpp:704
ROW::set_para
void set_para(PARA *p)
Definition: ocrrow.h:114
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
JUSTIFICATION_RIGHT
Definition: capi.h:135
tesseract::GeometricClassifierState::ltr
bool ltr
Definition: paragraphs.cpp:967
tesseract::GeometricClassifierState::left_tabs
GenericVector< Cluster > left_tabs
Definition: paragraphs.cpp:971
STRING::length
int32_t length() const
Definition: strngs.cpp:187
tesseract::GeometricClassifierState::first_indent
int first_indent
Definition: paragraphs.cpp:977
tesseract::GeometricClassifierState::AlignTabs
const GenericVector< Cluster > & AlignTabs() const
Definition: paragraphs.cpp:908
tesseract::GeometricClassifierState::tolerance
int tolerance
Definition: paragraphs.cpp:963
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
count
int count(LIST var_list)
Definition: oldlist.cpp:79
tesseract::Cluster::center
int center
Definition: paragraphs.cpp:676
tesseract::UnicodeSpanSkipper
Definition: paragraphs.cpp:311
ROW_RES
Definition: pageres.h:133
ParagraphModel::is_flush
bool is_flush() const
Definition: ocrpara.h:171
ocrrow.h
ParagraphModel::justification
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
tesseract::RowScratchRegisters::SetStartLine
void SetStartLine()
Definition: paragraphs.cpp:591
unichar.h
ROW
Definition: ocrrow.h:35
tesseract::LineHypothesis
Definition: paragraphs_internal.h:74
ocrblock.h
tesseract::GeometricClassifierState::OffsideTabs
const GenericVector< Cluster > & OffsideTabs() const
Definition: paragraphs.cpp:918
tesseract::GeometricClassifierState::AssumeLeftJustification
void AssumeLeftJustification()
Definition: paragraphs.cpp:897
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::GeometricClassifierState::Model
ParagraphModel Model() const
Definition: paragraphs.cpp:949
tesseract::ParagraphTheory::IndexOf
int IndexOf(const ParagraphModel *model) const
Definition: paragraphs.cpp:1284
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tesseract::LT_BODY
Definition: paragraphs_internal.h:51
PARA::model
const ParagraphModel * model
Definition: ocrpara.h:36
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::RowScratchRegisters::UniqueStartHypothesis
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:646
PARA
Definition: ocrpara.h:29
PARA::is_very_first_or_continuation
bool is_very_first_or_continuation
Definition: ocrpara.h:43
tesseract::RowScratchRegisters::GetLineType
LineType GetLineType() const
Definition: paragraphs.cpp:549
tesseract::RowInfo::ltr
bool ltr
Definition: paragraphs.h:44
paragraphs_internal.h
WERD_RES::word
WERD * word
Definition: pageres.h:180
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::UnicodeSpanSkipper::UnicodeSpanSkipper
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
Definition: paragraphs.cpp:313
tesseract::LT_START
Definition: paragraphs_internal.h:50
UpdateRange
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:118
RIL_BLOCK
Definition: capi.h:101
tesseract::RowScratchRegisters::StartHypotheses
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:625
GenericVector::sort
void sort()
Definition: genericvector.h:1102
tesseract::LT_MULTIPLE
Definition: paragraphs_internal.h:53
GenericVector::push_back_new
int push_back_new(const T &object)
Definition: genericvector.h:810
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::ParagraphTheory::NonCenteredModels
void NonCenteredModels(SetOfModels *models)
Definition: paragraphs.cpp:1276
tesseract::SetOfModels
GenericVectorEqEq< const ParagraphModel * > SetOfModels
Definition: paragraphs_internal.h:98
tesseract::InterwordSpace
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
Definition: paragraphs.cpp:1623
tesseract::GeometricClassifierState::right_tabs
GenericVector< Cluster > right_tabs
Definition: paragraphs.cpp:972
tesseract::GeometricClassifierState::PrintRows
void PrintRows() const
Definition: paragraphs.cpp:941
tesseract::JUSTIFICATION_CENTER
Definition: publictypes.h:251
tesseract::RowScratchRegisters::AddStartLine
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:611
tesseract::LineType
LineType
Definition: paragraphs_internal.h:49
tesseract::GeometricClassifierState::AlignsideTabIndex
int AlignsideTabIndex(int row_idx) const
Definition: paragraphs.cpp:930
tesseract::kCrownRight
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:71
tesseract::Cluster::Cluster
Cluster()
Definition: paragraphs.cpp:673
tesseract::StrongModel
bool StrongModel(const ParagraphModel *model)
Definition: paragraphs_internal.h:70
tesseract::ValidFirstLine
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1292
tesseract::SimpleClusterer
Definition: paragraphs.cpp:680