tesseract  4.0.0-1-g2a2b
paragraphs.cpp
1 /**********************************************************************
2  * File: paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author: David Eger
5  * Created: 25 February 2011
6  *
7  * (C) Copyright 2011, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "paragraphs.h"
21 #include <cctype> // for isspace
22 #include <cmath> // for abs
23 #include <cstdio> // for snprintf
24 #include <cstdlib> // for abs
25 #include <cstring> // for strchr, strlen
26 #include <algorithm> // for max
27 #include <memory> // for unique_ptr
28 #include "genericvector.h" // for GenericVector, GenericVectorEqEq
29 #include "helpers.h" // for UpdateRange, ClipToRange
30 #include "host.h" // for NearlyEqual
31 #include "mutableiterator.h" // for MutableIterator
32 #include "ocrblock.h" // for BLOCK
33 #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
34 #include "ocrrow.h" // for ROW
35 #include "pageiterator.h" // for PageIterator
36 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
37 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
38 #include "pdblock.h" // for PDBLK
39 #include "polyblk.h" // for POLY_BLOCK
40 #include "publictypes.h" // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
41 #include "ratngs.h" // for WERD_CHOICE
42 #include "rect.h" // for TBOX
43 #include "statistc.h" // for STATS
44 #include "strngs.h" // for STRING
45 #include "tprintf.h" // for tprintf
46 #include "unichar.h" // for UNICHAR, UNICHAR_ID
47 #include "unicharset.h" // for UNICHARSET
48 #include "unicodes.h" // for kPDF, kRLE
49 #include "werd.h" // for WERD, W_REP_CHAR
50 
51 namespace tesseract {
52 
53 // Special "weak" ParagraphModels.
54 const ParagraphModel *kCrownLeft
55  = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
56 const ParagraphModel *kCrownRight
57  = reinterpret_cast<ParagraphModel *>(0xDEAD888F);
58 
59 // Do the text and geometry of two rows support a paragraph break between them?
60 static bool LikelyParagraphStart(const RowScratchRegisters &before,
61  const RowScratchRegisters &after,
62  tesseract::ParagraphJustification j);
63 
64 // Given the width of a typical space between words, what is the threshold
65 // by which we think left and right alignments for paragraphs
66 // can vary and still be aligned.
67 static int Epsilon(int space_pix) {
68  return space_pix * 4 / 5;
69 }
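// For example, with an average interword space of 10 pixels, Epsilon() returns 8,
// so two indents within 8 pixels of each other are treated as the same alignment.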
70 
71 static bool AcceptableRowArgs(
72  int debug_level, int min_num_rows, const char *function_name,
73  const GenericVector<RowScratchRegisters> *rows,
74  int row_start, int row_end) {
75  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
76  tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
77  row_start, row_end, rows->size());
78  return false;
79  }
80  if (row_end - row_start < min_num_rows) {
81  if (debug_level > 1) {
82  tprintf("# Too few rows[%d, %d) for %s.\n",
83  row_start, row_end, function_name);
84  }
85  return false;
86  }
87  return true;
88 }
89 
90 // =============================== Debug Code ================================
91 
92 // Convert an integer to a decimal string.
93 static STRING StrOf(int num) {
94  char buffer[30];
95  snprintf(buffer, sizeof(buffer), "%d", num);
96  return STRING(buffer);
97 }
98 
99 // Given a row-major matrix of unicode text and a column separator, print
100 // a formatted table. For ASCII, we get good column alignment.
101 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
102  const STRING &colsep) {
103  GenericVector<int> max_col_widths;
104  for (int r = 0; r < rows.size(); r++) {
105  int num_columns = rows[r].size();
106  for (int c = 0; c < num_columns; c++) {
107  int num_unicodes = 0;
108  for (int i = 0; i < rows[r][c].size(); i++) {
109  if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
110  }
111  if (c >= max_col_widths.size()) {
112  max_col_widths.push_back(num_unicodes);
113  } else {
114  if (num_unicodes > max_col_widths[c])
115  max_col_widths[c] = num_unicodes;
116  }
117  }
118  }
119 
120  GenericVector<STRING> col_width_patterns;
121  for (int c = 0; c < max_col_widths.size(); c++) {
122  col_width_patterns.push_back(
123  STRING("%-") + StrOf(max_col_widths[c]) + "s");
124  }
125 
126  for (int r = 0; r < rows.size(); r++) {
127  for (int c = 0; c < rows[r].size(); c++) {
128  if (c > 0)
129  tprintf("%s", colsep.string());
130  tprintf(col_width_patterns[c].string(), rows[r][c].string());
131  }
132  tprintf("\n");
133  }
134 }
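// Note that column widths are measured in Unicode code points (bytes that are
// not UTF-8 continuation bytes), so alignment is only approximate for text whose
// code points are not all one column wide.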
135 
136 static STRING RtlEmbed(const STRING &word, bool rtlify) {
137  if (rtlify)
138  return STRING(kRLE) + word + STRING(kPDF);
139  return word;
140 }
141 
142 // Print the current thoughts of the paragraph detector.
143 static void PrintDetectorState(const ParagraphTheory &theory,
144  const GenericVector<RowScratchRegisters> &rows) {
145  GenericVector<GenericVector<STRING> > output;
146  output.push_back(GenericVector<STRING>());
147  output.back().push_back("#row");
148  output.back().push_back("space");
149  output.back().push_back("..");
150  output.back().push_back("lword[widthSEL]");
151  output.back().push_back("rword[widthSEL]");
152  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
153  output.back().push_back("text");
154 
155  for (int i = 0; i < rows.size(); i++) {
156  output.push_back(GenericVector<STRING>());
157  GenericVector<STRING> &row = output.back();
158  const RowInfo& ri = *rows[i].ri_;
159  row.push_back(StrOf(i));
160  row.push_back(StrOf(ri.average_interword_space));
161  row.push_back(ri.has_leaders ? ".." : " ");
162  row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
163  "[" + StrOf(ri.lword_box.width()) +
164  (ri.lword_likely_starts_idea ? "S" : "s") +
165  (ri.lword_likely_ends_idea ? "E" : "e") +
166  (ri.lword_indicates_list_item ? "L" : "l") +
167  "]");
168  row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
169  "[" + StrOf(ri.rword_box.width()) +
170  (ri.rword_likely_starts_idea ? "S" : "s") +
171  (ri.rword_likely_ends_idea ? "E" : "e") +
172  (ri.rword_indicates_list_item ? "L" : "l") +
173  "]");
174  rows[i].AppendDebugInfo(theory, &row);
175  row.push_back(RtlEmbed(ri.text, !ri.ltr));
176  }
177  PrintTable(output, " ");
178 
179  tprintf("Active Paragraph Models:\n");
180  for (int m = 0; m < theory.models().size(); m++) {
181  tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
182  }
183 }
184 
185 static void DebugDump(
186  bool should_print,
187  const STRING &phase,
188  const ParagraphTheory &theory,
189  const GenericVector<RowScratchRegisters> &rows) {
190  if (!should_print)
191  return;
192  tprintf("# %s\n", phase.string());
193  PrintDetectorState(theory, rows);
194 }
195 
196 // Print out the text for rows[row_start, row_end)
197 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
198  int row_start, int row_end) {
199  tprintf("======================================\n");
200  for (int row = row_start; row < row_end; row++) {
201  tprintf("%s\n", rows[row].ri_->text.string());
202  }
203  tprintf("======================================\n");
204 }
205 
206 // ============= Brain Dead Language Model (ASCII Version) ===================
207 
208 static bool IsLatinLetter(int ch) {
209  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
210 }
211 
212 static bool IsDigitLike(int ch) {
213  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
214 }
215 
216 static bool IsOpeningPunct(int ch) {
217  return strchr("'\"({[", ch) != nullptr;
218 }
219 
220 static bool IsTerminalPunct(int ch) {
221  return strchr(":'\".?!]})", ch) != nullptr;
222 }
223 
224 // Return a pointer past any leading characters of str that appear in toskip.
225 static const char *SkipChars(const char *str, const char *toskip) {
226  while (*str != '\0' && strchr(toskip, *str)) { str++; }
227  return str;
228 }
229 
230 static const char *SkipChars(const char *str, bool (*skip)(int)) {
231  while (*str != '\0' && skip(*str)) { str++; }
232  return str;
233 }
234 
235 static const char *SkipOne(const char *str, const char *toskip) {
236  if (*str != '\0' && strchr(toskip, *str)) return str + 1;
237  return str;
238 }
239 
240 // Return whether it is very likely that this is a numeral marker that could
241 // start a list item. Some examples include:
242 // A I iii. VI (2) 3.5. [C-4]
243 static bool LikelyListNumeral(const STRING &word) {
244  const char *kRomans = "ivxlmdIVXLMD";
245  const char *kDigits = "0123456789";
246  const char *kOpen = "[{(";
247  const char *kSep = ":;-.,";
248  const char *kClose = "]})";
249 
250  int num_segments = 0;
251  const char *pos = word.string();
252  while (*pos != '\0' && num_segments < 3) {
253  // skip up to two open parens.
254  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
255  const char *numeral_end = SkipChars(numeral_start, kRomans);
256  if (numeral_end != numeral_start) {
257  // Got Roman Numeral. Great.
258  } else {
259  numeral_end = SkipChars(numeral_start, kDigits);
260  if (numeral_end == numeral_start) {
261  // If there's a single latin letter, we can use that.
262  numeral_end = SkipChars(numeral_start, IsLatinLetter);
263  if (numeral_end - numeral_start != 1)
264  break;
265  }
266  }
267  // We got some sort of numeral.
268  num_segments++;
269  // Skip any trailing parens or punctuation.
270  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
271  if (pos == numeral_end)
272  break;
273  }
274  return *pos == '\0';
275 }
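// Worked example: for the word "(2)" the loop skips the "(" opener, consumes the
// digit "2" as one numeral segment, then skips the closing ")"; the scan ends at
// the end of the string, so the word is accepted as a likely list numeral.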
276 
277 static bool LikelyListMark(const STRING &word) {
278  const char *kListMarks = "0Oo*.,+.";
279  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
280 }
281 
282 bool AsciiLikelyListItem(const STRING &word) {
283  return LikelyListMark(word) || LikelyListNumeral(word);
284 }
285 
286 // ========== Brain Dead Language Model (Tesseract Version) ================
287 
288 // Return the first Unicode Codepoint from werd[pos].
289 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
290  if (!u || !werd || pos > werd->length())
291  return 0;
292  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
293 }
294 
295 // A useful helper class for finding the first j >= i so that word[j]
296 // does not have the given character type.
297 class UnicodeSpanSkipper {
298  public:
299  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
300  : u_(unicharset), word_(word) { wordlen_ = word->length(); }
301 
302  // Given an input position, return the first position >= pos not punc.
303  int SkipPunc(int pos);
304  // Given an input position, return the first position >= pos not digit.
305  int SkipDigits(int pos);
306  // Given an input position, return the first position >= pos not roman.
307  int SkipRomans(int pos);
308  // Given an input position, return the first position >= pos not alpha.
309  int SkipAlpha(int pos);
310 
311  private:
312  const UNICHARSET *u_;
313  const WERD_CHOICE *word_;
314  int wordlen_;
315 };
316 
317 int UnicodeSpanSkipper::SkipPunc(int pos) {
318  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
319  return pos;
320 }
321 
322 int UnicodeSpanSkipper::SkipDigits(int pos) {
323  while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
324  IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
325  return pos;
326 }
327 
328 int UnicodeSpanSkipper::SkipRomans(int pos) {
329  const char *kRomans = "ivxlmdIVXLMD";
330  while (pos < wordlen_) {
331  int ch = UnicodeFor(u_, word_, pos);
332  if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break;
333  pos++;
334  }
335  return pos;
336 }
337 
338 int UnicodeSpanSkipper::SkipAlpha(int pos) {
339  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
340  return pos;
341 }
342 
343 static bool LikelyListMarkUnicode(int ch) {
344  if (ch < 0x80) {
345  STRING single_ch;
346  single_ch += ch;
347  return LikelyListMark(single_ch);
348  }
349  switch (ch) {
350  // TODO(eger) expand this list of unicodes as needed.
351  case 0x00B0: // degree sign
352  case 0x2022: // bullet
353  case 0x25E6: // white bullet
354  case 0x00B7: // middle dot
355  case 0x25A1: // white square
356  case 0x25A0: // black square
357  case 0x25AA: // black small square
358  case 0x2B1D: // black very small square
359  case 0x25BA: // black right-pointing pointer
360  case 0x25CF: // black circle
361  case 0x25CB: // white circle
362  return true;
363  default:
364  break; // fall through
365  }
366  return false;
367 }
368 
369 // Return whether it is very likely that this is a numeral marker that could
370 // start a list item. Some examples include:
371 // A I iii. VI (2) 3.5. [C-4]
372 static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
373  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
374  return true;
375 
376  UnicodeSpanSkipper m(u, werd);
377  int num_segments = 0;
378  int pos = 0;
379  while (pos < werd->length() && num_segments < 3) {
380  int numeral_start = m.SkipPunc(pos);
381  if (numeral_start > pos + 1) break;
382  int numeral_end = m.SkipRomans(numeral_start);
383  if (numeral_end == numeral_start) {
384  numeral_end = m.SkipDigits(numeral_start);
385  if (numeral_end == numeral_start) {
386  // If there's a single latin letter, we can use that.
387  numeral_end = m.SkipAlpha(numeral_start);
388  if (numeral_end - numeral_start != 1)
389  break;
390  }
391  }
392  // We got some sort of numeral.
393  num_segments++;
394  // Skip any trailing punctuation.
395  pos = m.SkipPunc(numeral_end);
396  if (pos == numeral_end)
397  break;
398  }
399  return pos == werd->length();
400 }
401 
402 // ========= Brain Dead Language Model (combined entry points) ================
403 
404 // Given the leftmost word of a line either as a Tesseract unicharset + werd
405 // or a utf8 string, set the following attributes for it:
406 // is_list - this word might be a list number or bullet.
407 // starts_idea - this word is likely to start a sentence.
408 // ends_idea - this word is likely to end a sentence.
409 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
410  const STRING &utf8,
411  bool *is_list, bool *starts_idea, bool *ends_idea) {
412  *is_list = false;
413  *starts_idea = false;
414  *ends_idea = false;
415  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
416  *ends_idea = true;
417  return;
418  }
419 
420  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
421  if (UniLikelyListItem(unicharset, werd)) {
422  *is_list = true;
423  *starts_idea = true;
424  *ends_idea = true;
425  }
426  if (unicharset->get_isupper(werd->unichar_id(0))) {
427  *starts_idea = true;
428  }
429  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
430  *starts_idea = true;
431  *ends_idea = true;
432  }
433  } else { // Assume utf8 is mostly ASCII
434  if (AsciiLikelyListItem(utf8)) {
435  *is_list = true;
436  *starts_idea = true;
437  }
438  int start_letter = utf8[0];
439  if (IsOpeningPunct(start_letter)) {
440  *starts_idea = true;
441  }
442  if (IsTerminalPunct(start_letter)) {
443  *ends_idea = true;
444  }
445  if (start_letter >= 'A' && start_letter <= 'Z') {
446  *starts_idea = true;
447  }
448  }
449 }
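// For example, on the ASCII path the left word "(3)" sets is_list and starts_idea,
// "The" sets only starts_idea (capital letter), and "and" sets none of the flags.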
450 
451 // Given the rightmost word of a line either as a Tesseract unicharset + werd
452 // or a utf8 string, set the following attributes for it:
453 // is_list - this word might be a list number or bullet.
454 // starts_idea - this word is likely to start a sentence.
455 // ends_idea - this word is likely to end a sentence.
456 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
457  const STRING &utf8,
458  bool *is_list, bool *starts_idea, bool *ends_idea) {
459  *is_list = false;
460  *starts_idea = false;
461  *ends_idea = false;
462  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
463  *ends_idea = true;
464  return;
465  }
466 
467  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
468  if (UniLikelyListItem(unicharset, werd)) {
469  *is_list = true;
470  *starts_idea = true;
471  }
472  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
473  if (unicharset->get_ispunctuation(last_letter)) {
474  *ends_idea = true;
475  }
476  } else { // Assume utf8 is mostly ASCII
477  if (AsciiLikelyListItem(utf8)) {
478  *is_list = true;
479  *starts_idea = true;
480  }
481  int last_letter = utf8[utf8.size() - 1];
482  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
483  *ends_idea = true;
484  }
485  }
486 }
487 
488 // =============== Implementation of RowScratchRegisters =====================
489 /* static */
490 void RowScratchRegisters::AppendDebugHeaderFields(
491  GenericVector<STRING> *header) {
492  header->push_back("[lmarg,lind;rind,rmarg]");
493  header->push_back("model");
494 }
495 
496 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
497  GenericVector<STRING> *dbg) const {
498  char s[30];
499  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
500  lmargin_, lindent_, rindent_, rmargin_);
501  dbg->push_back(s);
502  STRING model_string;
503  model_string += static_cast<char>(GetLineType());
504  model_string += ":";
505 
506  int model_numbers = 0;
507  for (int h = 0; h < hypotheses_.size(); h++) {
508  if (hypotheses_[h].model == nullptr)
509  continue;
510  if (model_numbers > 0)
511  model_string += ",";
512  if (StrongModel(hypotheses_[h].model)) {
513  model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
514  } else if (hypotheses_[h].model == kCrownLeft) {
515  model_string += "CrL";
516  } else if (hypotheses_[h].model == kCrownRight) {
517  model_string += "CrR";
518  }
519  model_numbers++;
520  }
521  if (model_numbers == 0)
522  model_string += "0";
523 
524  dbg->push_back(model_string);
525 }
526 
527 void RowScratchRegisters::Init(const RowInfo &row) {
528  ri_ = &row;
529  lmargin_ = 0;
530  lindent_ = row.pix_ldistance;
531  rmargin_ = 0;
532  rindent_ = row.pix_rdistance;
533 }
534 
535 LineType RowScratchRegisters::GetLineType() const {
536  if (hypotheses_.empty())
537  return LT_UNKNOWN;
538  bool has_start = false;
539  bool has_body = false;
540  for (int i = 0; i < hypotheses_.size(); i++) {
541  switch (hypotheses_[i].ty) {
542  case LT_START: has_start = true; break;
543  case LT_BODY: has_body = true; break;
544  default:
545  tprintf("Encountered bad value in hypothesis list: %c\n",
546  hypotheses_[i].ty);
547  break;
548  }
549  }
550  if (has_start && has_body)
551  return LT_MULTIPLE;
552  return has_start ? LT_START : LT_BODY;
553 }
554 
555 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
556  if (hypotheses_.empty())
557  return LT_UNKNOWN;
558  bool has_start = false;
559  bool has_body = false;
560  for (int i = 0; i < hypotheses_.size(); i++) {
561  if (hypotheses_[i].model != model)
562  continue;
563  switch (hypotheses_[i].ty) {
564  case LT_START: has_start = true; break;
565  case LT_BODY: has_body = true; break;
566  default:
567  tprintf("Encountered bad value in hypothesis list: %c\n",
568  hypotheses_[i].ty);
569  break;
570  }
571  }
572  if (has_start && has_body)
573  return LT_MULTIPLE;
574  return has_start ? LT_START : LT_BODY;
575 }
576 
577 void RowScratchRegisters::SetStartLine() {
578  LineType current_lt = GetLineType();
579  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
580  tprintf("Trying to set a line to be START when it's already BODY.\n");
581  }
582  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
583  hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
584  }
585 }
586 
587 void RowScratchRegisters::SetBodyLine() {
588  LineType current_lt = GetLineType();
589  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
590  tprintf("Trying to set a line to be BODY when it's already START.\n");
591  }
592  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
593  hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
594  }
595 }
596 
597 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
598  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
599  int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
600  if (old_idx >= 0)
601  hypotheses_.remove(old_idx);
602 }
603 
604 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
605  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
606  int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
607  if (old_idx >= 0)
608  hypotheses_.remove(old_idx);
609 }
610 
611 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
612  for (int h = 0; h < hypotheses_.size(); h++) {
613  if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
614  models->push_back_new(hypotheses_[h].model);
615  }
616 }
617 
618 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
619  for (int h = 0; h < hypotheses_.size(); h++) {
620  if (StrongModel(hypotheses_[h].model))
621  models->push_back_new(hypotheses_[h].model);
622  }
623 }
624 
625 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
626  for (int h = 0; h < hypotheses_.size(); h++) {
627  if (hypotheses_[h].model != nullptr)
628  models->push_back_new(hypotheses_[h].model);
629  }
630 }
631 
632 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
633  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
634  return nullptr;
635  return hypotheses_[0].model;
636 }
637 
638 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
639  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
640  return nullptr;
641  return hypotheses_[0].model;
642 }
643 
644 // Discard any hypotheses whose model is not in the given list.
645 void RowScratchRegisters::DiscardNonMatchingHypotheses(
646  const SetOfModels &models) {
647  if (models.empty())
648  return;
649  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
650  if (!models.contains(hypotheses_[h].model)) {
651  hypotheses_.remove(h);
652  }
653  }
654 }
655 
656 // ============ Geometry based Paragraph Detection Algorithm =================
657 
658 struct Cluster {
659  Cluster() : center(0), count(0) {}
660  Cluster(int cen, int num) : center(cen), count(num) {}
661 
662  int center; // The center of the cluster.
663  int count; // The number of entries within the cluster.
664 };
665 
666 class SimpleClusterer {
667  public:
668  explicit SimpleClusterer(int max_cluster_width)
669  : max_cluster_width_(max_cluster_width) {}
670  void Add(int value) { values_.push_back(value); }
671  int size() const { return values_.size(); }
672  void GetClusters(GenericVector<Cluster> *clusters);
673 
674  private:
675  int max_cluster_width_;
676  GenericVectorEqEq<int> values_;
677 };
678 
679 // Return the index of the cluster closest to value.
680 static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
681  int best_index = 0;
682  for (int i = 0; i < clusters.size(); i++) {
683  if (abs(value - clusters[i].center) <
684  abs(value - clusters[best_index].center))
685  best_index = i;
686  }
687  return best_index;
688 }
689 
690 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
691  clusters->clear();
692  values_.sort();
693  for (int i = 0; i < values_.size();) {
694  int orig_i = i;
695  int lo = values_[i];
696  int hi = lo;
697  while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
698  hi = values_[i];
699  }
700  clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
701  }
702 }
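// For example, with max_cluster_width_ = 8, the sorted values {0, 2, 5, 30, 32}
// yield two clusters: Cluster(2, 3) covering {0, 2, 5} and Cluster(31, 2)
// covering {30, 32}.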
703 
704 // Calculate left- and right-indent tab stop values seen in
705 // rows[row_start, row_end) given a tolerance of tolerance.
706 static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
707  int row_start, int row_end, int tolerance,
708  GenericVector<Cluster> *left_tabs,
709  GenericVector<Cluster> *right_tabs) {
710  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
711  return;
712  // First pass: toss all left and right indents into clusterers.
713  SimpleClusterer initial_lefts(tolerance);
714  SimpleClusterer initial_rights(tolerance);
715  GenericVector<Cluster> initial_left_tabs;
716  GenericVector<Cluster> initial_right_tabs;
717  for (int i = row_start; i < row_end; i++) {
718  initial_lefts.Add((*rows)[i].lindent_);
719  initial_rights.Add((*rows)[i].rindent_);
720  }
721  initial_lefts.GetClusters(&initial_left_tabs);
722  initial_rights.GetClusters(&initial_right_tabs);
723 
724  // Second pass: cluster only lines that are not "stray"
725  // An example of a stray line is a page number -- a line whose start
726  // and end tab-stops are far outside the typical start and end tab-stops
727  // for the block.
728  // Put another way, we only cluster data from lines whose start or end
729  // tab stop is frequent.
730  SimpleClusterer lefts(tolerance);
731  SimpleClusterer rights(tolerance);
732 
733  // Outlier elimination. We might want to switch this to test outlier-ness
734  // based on how strange a position an outlier is in instead of or in addition
735 // to how rare it is. These outliers get re-added, however, if we end up
736 // with too few tab stops to work with.
737  int infrequent_enough_to_ignore = 0;
738  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
739  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
740 
741  for (int i = row_start; i < row_end; i++) {
742  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
743  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
744  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
745  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
746  lefts.Add((*rows)[i].lindent_);
747  rights.Add((*rows)[i].rindent_);
748  }
749  }
750  lefts.GetClusters(left_tabs);
751  rights.GetClusters(right_tabs);
752 
753  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
754  (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
755  // One side is really ragged, and the other only has one tab stop,
756  // so those "insignificant outliers" are probably important, actually.
757  // This often happens on a page of an index. Add back in the ones
758  // we omitted in the first pass.
759  for (int i = row_start; i < row_end; i++) {
760  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
761  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
762  if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
763  initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
764  lefts.Add((*rows)[i].lindent_);
765  rights.Add((*rows)[i].rindent_);
766  }
767  }
768  }
769  lefts.GetClusters(left_tabs);
770  rights.GetClusters(right_tabs);
771 
772  // If one side is almost a two-indent aligned side, and the other clearly
773  // isn't, try to prune out the least frequent tab stop from that side.
774  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
775  int to_prune = -1;
776  for (int i = left_tabs->size() - 1; i >= 0; i--) {
777  if (to_prune < 0 ||
778  (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
779  to_prune = i;
780  }
781  }
782  if (to_prune >= 0 &&
783  (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
784  left_tabs->remove(to_prune);
785  }
786  }
787  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
788  int to_prune = -1;
789  for (int i = right_tabs->size() - 1; i >= 0; i--) {
790  if (to_prune < 0 ||
791  (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
792  to_prune = i;
793  }
794  }
795  if (to_prune >= 0 &&
796  (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
797  right_tabs->remove(to_prune);
798  }
799  }
800 }
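// For a typical left-to-right block whose paragraphs use an indented first line,
// this typically produces two left tab stops (the first-line indent and the body
// indent) and a single right tab stop for the ragged or justified right edge.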
801 
802 // Given a paragraph model, mark rows[row_start, row_end) as start or body
803 // lines of that model.
804 //
805 // Case 1: model->first_indent_ != model->body_indent_
806 // Differentiating the paragraph start lines from the paragraph body lines in
807 // this case is easy, we just see how far each line is indented.
808 //
809 // Case 2: model->first_indent_ == model->body_indent_
810 // Here, we find end-of-paragraph lines by looking for "short lines."
811 // What constitutes a "short line" changes depending on whether the text is
812 // ragged-right[left] or fully justified (aligned left and right).
813 //
814 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
815 // We have a new paragraph if the first word of the line would have fit at
816 // the end of the previous line.
817 //
818 // Case 2b: Fully Justified. (eop_threshold > 0)
819 // We mark a line as short (end of paragraph) if the offside indent
820 // is greater than eop_threshold.
821 static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
822  int row_start, int row_end,
823  const ParagraphModel *model,
824  bool ltr, int eop_threshold) {
825  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
826  return;
827  for (int row = row_start; row < row_end; row++) {
828  bool valid_first = ValidFirstLine(rows, row, model);
829  bool valid_body = ValidBodyLine(rows, row, model);
830  if (valid_first && !valid_body) {
831  (*rows)[row].AddStartLine(model);
832  } else if (valid_body && !valid_first) {
833  (*rows)[row].AddBodyLine(model);
834  } else if (valid_body && valid_first) {
835  bool after_eop = (row == row_start);
836  if (row > row_start) {
837  if (eop_threshold > 0) {
838  if (model->justification() == JUSTIFICATION_LEFT) {
839  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
840  } else {
841  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
842  }
843  } else {
844  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
845  model->justification());
846  }
847  }
848  if (after_eop) {
849  (*rows)[row].AddStartLine(model);
850  } else {
851  (*rows)[row].AddBodyLine(model);
852  }
853  } else {
854  // Do nothing. Stray row.
855  }
856  }
857 }
858 
859 // GeometricClassifierState holds all of the information we'll use while
860 // trying to determine a paragraph model for the text lines in a block of
861 // text:
862 // + the rows under consideration [row_start, row_end)
863 // + the common left- and right-indent tab stops
864 // + does the block start out left-to-right or right-to-left
865 // Further, this struct holds the data we amass for the (single) ParagraphModel
866 // we'll assign to the text lines (assuming we get that far).
867 struct GeometricClassifierState {
868  GeometricClassifierState(int dbg_level,
869  GenericVector<RowScratchRegisters> *r,
870  int r_start, int r_end)
871  : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
872  margin(0) {
873  tolerance = InterwordSpace(*r, r_start, r_end);
874  CalculateTabStops(r, r_start, r_end, tolerance,
875  &left_tabs, &right_tabs);
876  if (debug_level >= 3) {
877  tprintf("Geometry: TabStop cluster tolerance = %d; "
878  "%d left tabs; %d right tabs\n",
879  tolerance, left_tabs.size(), right_tabs.size());
880  }
881  ltr = (*r)[r_start].ri_->ltr;
882  }
883 
884  void AssumeLeftJustification() {
885  just = tesseract::JUSTIFICATION_LEFT;
886  margin = (*rows)[row_start].lmargin_;
887  }
888 
889  void AssumeRightJustification() {
890  just = tesseract::JUSTIFICATION_RIGHT;
891  margin = (*rows)[row_start].rmargin_;
892  }
893 
894  // Align tabs are the tab stops the text is aligned to.
895  const GenericVector<Cluster> &AlignTabs() const {
896  if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
897  return left_tabs;
898  }
899 
900  // Offside tabs are the tab stops opposite the tabs used to align the text.
901  //
902  // Note that for a left-to-right text which is aligned to the right such as
903  // this function comment, the offside tabs are the horizontal tab stops
904  // marking the beginning of ("Note", "this" and "marking").
905  const GenericVector<Cluster> &OffsideTabs() const {
906  if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
907  return right_tabs;
908  }
909 
910  // Return whether the i'th row extends from the leftmost left tab stop
911  // to the right most right tab stop.
912  bool IsFullRow(int i) const {
913  return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
914  ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
915  }
916 
917  int AlignsideTabIndex(int row_idx) const {
918  return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
919  }
920 
921  // Given what we know about the paragraph justification (just), would the
922  // first word of row_b have fit at the end of row_a?
923  bool FirstWordWouldHaveFit(int row_a, int row_b) {
924  return ::tesseract::FirstWordWouldHaveFit(
925  (*rows)[row_a], (*rows)[row_b], just);
926  }
927 
928  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
929 
930  void Fail(int min_debug_level, const char *why) const {
931  if (debug_level < min_debug_level) return;
932  tprintf("# %s\n", why);
933  PrintRows();
934  }
935 
936  ParagraphModel Model() const {
937  return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
938  }
939 
940  // We print out messages with a debug level at least as great as debug_level.
941  int debug_level;
942 
943  // The Geometric Classifier was asked to find a single paragraph model
944  // to fit the text rows (*rows)[row_start, row_end)
945  GenericVector<RowScratchRegisters> *rows;
946  int row_start;
947  int row_end;
948 
949  // The amount by which we expect the text edge can vary and still be aligned.
950  int tolerance;
951 
952  // Is the script in this text block left-to-right?
953  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
954  bool ltr;
955 
956  // These left and right tab stops were determined to be the common tab
957  // stops for the given text.
958  GenericVector<Cluster> left_tabs;
959  GenericVector<Cluster> right_tabs;
960 
961  // These are parameters we must determine to create a ParagraphModel.
962  tesseract::ParagraphJustification just;
963  int margin;
964  int first_indent;
965  int body_indent;
966 
967  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
968  int eop_threshold;
969 };
970 
971 // Given a section of text where strong textual clues did not help identifying
972 // paragraph breaks, and for which the left and right indents have exactly
973 // three tab stops between them, attempt to find the paragraph breaks based
974 // solely on the outline of the text and whether the script is left-to-right.
975 //
976 // Algorithm Detail:
977 // The selected rows are in the form of a rectangle except
978 // for some number of "short lines" of the same length:
979 //
980 //   (A1)  xxxxxxxxxxxxx   (B1)  xxxxxxxxxxxx
981 //         xxxxxxxxxxx            xxxxxxxxxx    # A "short" line.
982 //         xxxxxxxxxxxxx          xxxxxxxxxxxx
983 //         xxxxxxxxxxxxx          xxxxxxxxxxxx
984 //
985 // We have a slightly different situation if the only short
986 // line is at the end of the excerpt.
987 //
988 //   (A2)  xxxxxxxxxxxxx   (B2)  xxxxxxxxxxxx
989 //         xxxxxxxxxxxxx          xxxxxxxxxxxx
990 //         xxxxxxxxxxxxx          xxxxxxxxxxxx
991 //         xxxxxxxxxxx            xxxxxxxxxx    # A "short" line.
992 //
993 // We'll interpret these as follows based on the reasoning in the comment for
994 // GeometricClassify():
995 // [script direction: first indent, body indent]
996 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
997 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
998 static void GeometricClassifyThreeTabStopTextBlock(
999  int debug_level,
1000  GeometricClassifierState &s,
1001  ParagraphTheory *theory) {
1002  int num_rows = s.row_end - s.row_start;
1003  int num_full_rows = 0;
1004  int last_row_full = 0;
1005  for (int i = s.row_start; i < s.row_end; i++) {
1006  if (s.IsFullRow(i)) {
1007  num_full_rows++;
1008  if (i == s.row_end - 1) last_row_full++;
1009  }
1010  }
1011 
1012  if (num_full_rows < 0.7 * num_rows) {
1013  s.Fail(1, "Not enough full lines to know which lines start paras.");
1014  return;
1015  }
1016 
1017  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1018  s.eop_threshold = 0;
1019 
1020  if (s.ltr) {
1021  s.AssumeLeftJustification();
1022  } else {
1023  s.AssumeRightJustification();
1024  }
1025 
1026  if (debug_level > 0) {
1027  tprintf("# Not enough variety for clear outline classification. "
1028  "Guessing these are %s aligned based on script.\n",
1029  s.ltr ? "left" : "right");
1030  s.PrintRows();
1031  }
1032 
1033  if (s.AlignTabs().size() == 2) { // case A1 or A2
1034  s.first_indent = s.AlignTabs()[1].center;
1035  s.body_indent = s.AlignTabs()[0].center;
1036  } else { // case B1 or B2
1037  if (num_rows - 1 == num_full_rows - last_row_full) {
1038  // case B2
1039  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1040  (*s.rows)[s.row_start].AddStartLine(model);
1041  for (int i = s.row_start + 1; i < s.row_end; i++) {
1042  (*s.rows)[i].AddBodyLine(model);
1043  }
1044  return;
1045  } else {
1046  // case B1
1047  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1048  s.eop_threshold = (s.OffsideTabs()[0].center +
1049  s.OffsideTabs()[1].center) / 2;
1050  }
1051  }
1052  const ParagraphModel *model = theory->AddModel(s.Model());
1053  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1054  s.ltr, s.eop_threshold);
1055  return;
1056 }
1057 
1058 // This function is called if strong textual clues were not available, but
1059 // the caller hopes that the paragraph breaks will be super obvious just
1060 // by the outline of the text.
1061 //
1062 // The particularly difficult case is figuring out what's going on if you
1063 // don't have enough short paragraph end lines to tell us what's going on.
1064 //
1065 // For instance, let's say you have the following outline:
1066 //
1067 // (A1) xxxxxxxxxxxxxxxxxxxxxx
1068 // xxxxxxxxxxxxxxxxxxxx
1069 // xxxxxxxxxxxxxxxxxxxxxx
1070 // xxxxxxxxxxxxxxxxxxxxxx
1071 //
1072 // Even if we know that the text is left-to-right and so will probably be
1073 // left-aligned, both of the following are possible texts:
1074 //
1075 // (A1a) 1. Here our list item
1076 // with two full lines.
1077 // 2. Here a second item.
1078 // 3. Here our third one.
1079 //
1080 // (A1b) so ends paragraph one.
1081 // Here starts another
1082 // paragraph we want to
1083 // read. This continues
1084 //
1085 // These examples are obvious from the text and should have been caught
1086 // by the StrongEvidenceClassify pass. However, for languages where we don't
1087 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1088 // it's worth guessing that (A1b) is the correct interpretation if there are
1089 // far more "full" lines than "short" lines.
1090 static void GeometricClassify(int debug_level,
1091  GenericVector<RowScratchRegisters> *rows,
1092  int row_start, int row_end,
1093  ParagraphTheory *theory) {
1094  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1095  return;
1096  if (debug_level > 1) {
1097  tprintf("###############################################\n");
1098  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1099  row_start, row_end);
1100  tprintf("###############################################\n");
1101  }
1102  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1103 
1104  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1105  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1106  s.Fail(2, "Too much variety for simple outline classification.");
1107  return;
1108  }
1109  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1110  s.Fail(1, "Not enough variety for simple outline classification.");
1111  return;
1112  }
1113  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1114  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1115  return;
1116  }
1117 
1118  // At this point, we know that one side has at least two tab stops, and the
1119  // other side has one or two tab stops.
1120  // Left to determine:
1121  // (1) Which is the body indent and which is the first line indent?
1122  // (2) Is the text fully justified?
1123 
1124  // If one side happens to have three or more tab stops, assume that side
1125  // is opposite of the aligned side.
1126  if (s.right_tabs.size() > 2) {
1127  s.AssumeLeftJustification();
1128  } else if (s.left_tabs.size() > 2) {
1129  s.AssumeRightJustification();
1130  } else if (s.ltr) { // guess based on script direction
1131  s.AssumeLeftJustification();
1132  } else {
1133  s.AssumeRightJustification();
1134  }
1135 
1136  if (s.AlignTabs().size() == 2) {
1137  // For each tab stop on the aligned side, how many of them appear
1138  // to be paragraph start lines? [first lines]
1139  int firsts[2] = {0, 0};
1140  // Count the first line as a likely paragraph start line.
1141  firsts[s.AlignsideTabIndex(s.row_start)]++;
1142  // For each line, if the first word would have fit on the previous
1143  // line count it as a likely paragraph start line.
1144  bool jam_packed = true;
1145  for (int i = s.row_start + 1; i < s.row_end; i++) {
1146  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1147  firsts[s.AlignsideTabIndex(i)]++;
1148  jam_packed = false;
1149  }
1150  }
1151  // Make an extra accounting for the last line of the paragraph just
1152  // in case it's the only short line in the block. That is, take its
1153  // first word as typical and see if this looks like the *last* line
1154  // of a paragraph. If so, mark the *other* indent as probably a first.
1155  if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1156  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1157  }
1158 
1159  int percent0firsts, percent1firsts;
1160  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1161  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1162 
1163  // TODO(eger): Tune these constants if necessary.
1164  if ((percent0firsts < 20 && 30 < percent1firsts) ||
1165  percent0firsts + 30 < percent1firsts) {
1166  s.first_indent = s.AlignTabs()[1].center;
1167  s.body_indent = s.AlignTabs()[0].center;
1168  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1169  percent1firsts + 30 < percent0firsts) {
1170  s.first_indent = s.AlignTabs()[0].center;
1171  s.body_indent = s.AlignTabs()[1].center;
1172  } else {
1173  // Ambiguous! Probably lineated (poetry)
1174  if (debug_level > 1) {
1175  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1176  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1177  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1178  s.AlignTabs()[0].center, percent0firsts);
1179  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1180  s.AlignTabs()[1].center, percent1firsts);
1181  s.PrintRows();
1182  }
1183  return;
1184  }
1185  } else {
1186  // There's only one tab stop for the "aligned to" side.
1187  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1188  }
1189 
1190  // At this point, we have our model.
1191  const ParagraphModel *model = theory->AddModel(s.Model());
1192 
1193  // Now all we have to do is figure out if the text is fully justified or not.
1194  // eop_threshold: default to fully justified unless we see evidence below.
1195  // See description on MarkRowsWithModel()
1196  s.eop_threshold =
1197  (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1198  // If the text is not fully justified, re-set the eop_threshold to 0.
1199  if (s.AlignTabs().size() == 2) {
1200  // Paragraphs with a paragraph-start indent.
1201  for (int i = s.row_start; i < s.row_end - 1; i++) {
1202  if (ValidFirstLine(s.rows, i + 1, model) &&
1203  !NearlyEqual(s.OffsideTabs()[0].center,
1204  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1205  // We found a non-end-of-paragraph short line: not fully justified.
1206  s.eop_threshold = 0;
1207  break;
1208  }
1209  }
1210  } else {
1211  // Paragraphs with no paragraph-start indent.
1212  for (int i = s.row_start; i < s.row_end - 1; i++) {
1213  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1214  !NearlyEqual(s.OffsideTabs()[0].center,
1215  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1216  // We found a non-end-of-paragraph short line: not fully justified.
1217  s.eop_threshold = 0;
1218  break;
1219  }
1220  }
1221  }
1222  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1223 }
1224 
1225 // =============== Implementation of ParagraphTheory =====================
1226 
1227 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
1228  for (int i = 0; i < models_->size(); i++) {
1229  if ((*models_)[i]->Comparable(model))
1230  return (*models_)[i];
1231  }
1232  ParagraphModel *m = new ParagraphModel(model);
1233  models_->push_back(m);
1234  models_we_added_.push_back_new(m);
1235  return m;
1236 }
1237 
1238 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1239  for (int i = models_->size() - 1; i >= 0; i--) {
1240  ParagraphModel *m = (*models_)[i];
1241  if (!used_models.contains(m) && models_we_added_.contains(m)) {
1242  models_->remove(i);
1243  models_we_added_.remove(models_we_added_.get_index(m));
1244  delete m;
1245  }
1246  }
1247 }
1248 
1249 // Examine rows[start, end) and try to determine if an existing non-centered
1250 // paragraph model would fit them perfectly. If so, return a pointer to it.
1251 // If not, return nullptr.
1252 const ParagraphModel *ParagraphTheory::Fits(
1253  const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
1254  for (int m = 0; m < models_->size(); m++) {
1255  const ParagraphModel *model = (*models_)[m];
1256  if (model->justification() != JUSTIFICATION_CENTER &&
1257  RowsFitModel(rows, start, end, model))
1258  return model;
1259  }
1260  return nullptr;
1261 }
1262 
1263 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1264  for (int m = 0; m < models_->size(); m++) {
1265  const ParagraphModel *model = (*models_)[m];
1266  if (model->justification() != JUSTIFICATION_CENTER)
1267  models->push_back_new(model);
1268  }
1269 }
1270 
1271 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1272  for (int i = 0; i < models_->size(); i++) {
1273  if ((*models_)[i] == model)
1274  return i;
1275  }
1276  return -1;
1277 }
1278 
1279 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
1280  int row, const ParagraphModel *model) {
1281  if (!StrongModel(model)) {
1282  tprintf("ValidFirstLine() should only be called with strong models!\n");
1283  }
1284  return StrongModel(model) &&
1285  model->ValidFirstLine(
1286  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1287  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1288 }
1289 
1290 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
1291  int row, const ParagraphModel *model) {
1292  if (!StrongModel(model)) {
1293  tprintf("ValidBodyLine() should only be called with strong models!\n");
1294  }
1295  return StrongModel(model) &&
1296  model->ValidBodyLine(
1297  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1298  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1299 }
1300 
1301 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
1302  int a, int b, const ParagraphModel *model) {
1303  if (model != kCrownRight && model != kCrownLeft) {
1304  tprintf("CrownCompatible() should only be called with crown models!\n");
1305  return false;
1306  }
1307  RowScratchRegisters &row_a = (*rows)[a];
1308  RowScratchRegisters &row_b = (*rows)[b];
1309  if (model == kCrownRight) {
1310  return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1311  row_b.rindent_ + row_b.rmargin_,
1312  Epsilon(row_a.ri_->average_interword_space));
1313  }
1314  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1315  row_b.lindent_ + row_b.lmargin_,
1316  Epsilon(row_a.ri_->average_interword_space));
1317 }
1318 
1319 
1320 // =============== Implementation of ParagraphModelSmearer ====================
1321 
1322 ParagraphModelSmearer::ParagraphModelSmearer(
1323  GenericVector<RowScratchRegisters> *rows,
1324  int row_start, int row_end, ParagraphTheory *theory)
1325  : theory_(theory), rows_(rows), row_start_(row_start),
1326  row_end_(row_end) {
1327  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1328  row_start_ = 0;
1329  row_end_ = 0;
1330  return;
1331  }
1332  SetOfModels no_models;
1333  for (int row = row_start - 1; row <= row_end; row++) {
1334  open_models_.push_back(no_models);
1335  }
1336 }
1337 
1338 // see paragraphs_internal.h
1339 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1340  SetOfModels no_models;
1341  if (row_start < row_start_) row_start = row_start_;
1342  if (row_end > row_end_) row_end = row_end_;
1343 
1344  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1345  row++) {
1346  if ((*rows_)[row].ri_->num_words == 0) {
1347  OpenModels(row + 1) = no_models;
1348  } else {
1349  SetOfModels &opened = OpenModels(row);
1350  (*rows_)[row].StartHypotheses(&opened);
1351 
1352  // Which models survive the transition from row to row + 1?
1353  SetOfModels still_open;
1354  for (int m = 0; m < opened.size(); m++) {
1355  if (ValidFirstLine(rows_, row, opened[m]) ||
1356  ValidBodyLine(rows_, row, opened[m])) {
1357 // This is basic filtering; we check likely paragraph start-ness further
1358 // down in Smear(), e.g. whether the first word would have fit on the
1359 // previous line.
1360  still_open.push_back_new(opened[m]);
1361  }
1362  }
1363  OpenModels(row + 1) = still_open;
1364  }
1365  }
1366 }
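// In other words, OpenModels(row + 1) holds the models opened on or before row
// whose first-line or body-line geometry still fits row; an empty row clears
// the set.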
1367 
1368 // see paragraphs_internal.h
1369 void ParagraphModelSmearer::Smear() {
1370  CalculateOpenModels(row_start_, row_end_);
1371 
1372  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1373  // we have multiple LT_START hypotheses), see if there's a model that
1374  // was recently used (an "open" model) which might model it well.
1375  for (int i = row_start_; i < row_end_; i++) {
1376  RowScratchRegisters &row = (*rows_)[i];
1377  if (row.ri_->num_words == 0)
1378  continue;
1379 
1380  // Step One:
1381 // Figure out if there are "open" models which are left-aligned or
1382  // right-aligned. This is important for determining whether the
1383  // "first" word in a row would fit at the "end" of the previous row.
1384  bool left_align_open = false;
1385  bool right_align_open = false;
1386  for (int m = 0; m < OpenModels(i).size(); m++) {
1387  switch (OpenModels(i)[m]->justification()) {
1388  case JUSTIFICATION_LEFT: left_align_open = true; break;
1389  case JUSTIFICATION_RIGHT: right_align_open = true; break;
1390  default: left_align_open = right_align_open = true;
1391  }
1392  }
1393  // Step Two:
1394  // Use that knowledge to figure out if this row is likely to
1395  // start a paragraph.
1396  bool likely_start;
1397  if (i == 0) {
1398  likely_start = true;
1399  } else {
1400  if ((left_align_open && right_align_open) ||
1401  (!left_align_open && !right_align_open)) {
1402  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1403  JUSTIFICATION_LEFT) ||
1404  LikelyParagraphStart((*rows_)[i - 1], row,
1405  JUSTIFICATION_RIGHT);
1406  } else if (left_align_open) {
1407  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1408  JUSTIFICATION_LEFT);
1409  } else {
1410  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1411  JUSTIFICATION_RIGHT);
1412  }
1413  }
1414 
1415  // Step Three:
1416  // If this text line seems like an obvious first line of an
1417  // open model, or an obvious continuation of an existing
1418  // modelled paragraph, mark it up.
1419  if (likely_start) {
1420  // Add Start Hypotheses for all Open models that fit.
1421  for (int m = 0; m < OpenModels(i).size(); m++) {
1422  if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1423  row.AddStartLine(OpenModels(i)[m]);
1424  }
1425  }
1426  } else {
1427  // Add relevant body line hypotheses.
1428  SetOfModels last_line_models;
1429  if (i > 0) {
1430  (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1431  } else {
1432  theory_->NonCenteredModels(&last_line_models);
1433  }
1434  for (int m = 0; m < last_line_models.size(); m++) {
1435  const ParagraphModel *model = last_line_models[m];
1436  if (ValidBodyLine(rows_, i, model))
1437  row.AddBodyLine(model);
1438  }
1439  }
1440 
1441  // Step Four:
1442  // If we're still quite unsure about this line, go through all
1443  // models in our theory and see if this row could be the start
1444  // of any of our models.
1445  if (row.GetLineType() == LT_UNKNOWN ||
1446  (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1447  SetOfModels all_models;
1448  theory_->NonCenteredModels(&all_models);
1449  for (int m = 0; m < all_models.size(); m++) {
1450  if (ValidFirstLine(rows_, i, all_models[m])) {
1451  row.AddStartLine(all_models[m]);
1452  }
1453  }
1454  }
1455  // Step Five:
1456  // Since we may have updated the hypotheses about this row, we need
1457  // to recalculate the Open models for the rest of rows[i + 1, row_end)
1458  if (row.GetLineType() != LT_UNKNOWN) {
1459  CalculateOpenModels(i + 1, row_end_);
1460  }
1461  }
1462 }
1463 
1464 // ================ Main Paragraph Detection Algorithm =======================
1465 
1466 // Find out what ParagraphModels are actually used, and discard any
1467 // that are not.
1468 static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
1469  ParagraphTheory *theory) {
1470  SetOfModels used_models;
1471  for (int i = 0; i < rows.size(); i++) {
1472  rows[i].StrongHypotheses(&used_models);
1473  }
1474  theory->DiscardUnusedModels(used_models);
1475 }
1476 
1477 // DowngradeWeakestToCrowns:
1478 // Forget any flush-{left, right} models unless we see two or more
1479 // of them in sequence.
1480 //
1481 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1482 // where the first line and body indent are the same) as having proper Models.
1483 // This is generally dangerous, since if you start imagining that flush-left
1484 // is a typical paragraph model when it is not, it will lead you to chop normal
1485 // indented paragraphs in the middle whenever a sentence happens to start on a
1486 // new line (see "This" above). What to do?
1487 // What we do is to take any paragraph which is flush left and is not
1488 // preceded by another paragraph of the same model and convert it to a "Crown"
1489 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1490 // for later. It means that the paragraph is flush, but it would be desirable
1491 // to mark it as the same model as the following text if it fits. This
1492 // downgrade path, FlushLeft -> CrownLeft -> model of following paragraph,
1493 // means that we avoid making flush-left paragraph models whenever we see a
1494 // top-of-the-page half-of-a-paragraph, and instead mark it as normal body text.
1495 //
1496 // Implementation:
1497 //
1498 // Comb backwards through the row scratch registers, and turn any
1499 // sequences of body lines of equivalent type abutted against the beginning of
1500 // the block, or against a body or start line of a different type, into a crown paragraph.
1501 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1502  GenericVector<RowScratchRegisters> *rows) {
1503  int start;
1504  for (int end = rows->size(); end > 0; end = start) {
1505  // Search back for a body line of a unique type.
1506  const ParagraphModel *model = nullptr;
1507  while (end > 0 &&
1508  (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1509  end--;
1510  }
1511  if (end == 0) break;
1512  start = end - 1;
1513  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1514  start--; // walk back to the first line that is not the same body type.
1515  }
1516  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1517  StrongModel(model) &&
1518  NearlyEqual(model->first_indent(), model->body_indent(),
1519  model->tolerance())) {
1520  start--;
1521  }
1522  start++;
1523  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1524  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1525  continue;
1526  if (!StrongModel(model)) {
1527  while (start > 0 &&
1528  CrownCompatible(rows, start - 1, start, model))
1529  start--;
1530  }
1531  if (start == 0 ||
1532  (!StrongModel(model)) ||
1533  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1534  // crownify rows[start, end)
1535  const ParagraphModel *crown_model = model;
1536  if (StrongModel(model)) {
1537  if (model->justification() == JUSTIFICATION_LEFT)
1538  crown_model = kCrownLeft;
1539  else
1540  crown_model = kCrownRight;
1541  }
1542  (*rows)[start].SetUnknown();
1543  (*rows)[start].AddStartLine(crown_model);
1544  for (int row = start + 1; row < end; row++) {
1545  (*rows)[row].SetUnknown();
1546  (*rows)[row].AddBodyLine(crown_model);
1547  }
1548  }
1549  }
1550  DiscardUnusedModels(*rows, theory);
1551 }
1552 
1553 
1554 // Clear all hypotheses about lines [start, end) and reset margins.
1555 //
1556 // The empty space between the left of a row and the block boundary (and
1557 // similarly for the right) is split into two pieces: margin and indent.
1558 // In initial processing, we assume the block is tight and the margin for
1559 // all lines is set to zero. However, if our first pass does not yield
1560 // models for everything, it may be due to an inset paragraph like a
1561 // block-quote. In that case, we make a second pass over that unmarked
1562 // section of the page and reset the "margin" portion of the empty space
1563 // to the common amount of space at the ends of the lines under consid-
1564 // eration. This would be equivalent to percentile set to 0. However,
1565 // sometimes we have a single character sticking out in the right margin
1566 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1567 // really just ignore it as an outlier. To express this, we allow the
1568 // user to specify the percentile (0..100) of indent values to use as
1569 // the common margin for each row in the run of rows[start, end).
1570 void RecomputeMarginsAndClearHypotheses(
1571  GenericVector<RowScratchRegisters> *rows, int start, int end,
1572  int percentile) {
1573  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1574  return;
1575 
1576  int lmin, lmax, rmin, rmax;
1577  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1578  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1579  for (int i = start; i < end; i++) {
1580  RowScratchRegisters &sr = (*rows)[i];
1581  sr.SetUnknown();
1582  if (sr.ri_->num_words == 0)
1583  continue;
1584  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1585  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1586  }
1587  STATS lefts(lmin, lmax + 1);
1588  STATS rights(rmin, rmax + 1);
1589  for (int i = start; i < end; i++) {
1590  RowScratchRegisters &sr = (*rows)[i];
1591  if (sr.ri_->num_words == 0)
1592  continue;
1593  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1594  rights.add(sr.rmargin_ + sr.rindent_, 1);
1595  }
1596  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1597  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1598  for (int i = start; i < end; i++) {
1599  RowScratchRegisters &sr = (*rows)[i];
1600  int ldelta = ignorable_left - sr.lmargin_;
1601  sr.lmargin_ += ldelta;
1602  sr.lindent_ -= ldelta;
1603  int rdelta = ignorable_right - sr.rmargin_;
1604  sr.rmargin_ += rdelta;
1605  sr.rindent_ -= rdelta;
1606  }
1607 }
1608 
1609 // Return the median inter-word space in rows[row_start, row_end).
1610 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
1611  int row_start, int row_end) {
1612  if (row_end < row_start + 1) return 1;
1613  int word_height = (rows[row_start].ri_->lword_box.height() +
1614  rows[row_end - 1].ri_->lword_box.height()) / 2;
1615  int word_width = (rows[row_start].ri_->lword_box.width() +
1616  rows[row_end - 1].ri_->lword_box.width()) / 2;
1617  STATS spacing_widths(0, 5 + word_width);
1618  for (int i = row_start; i < row_end; i++) {
1619  if (rows[i].ri_->num_words > 1) {
1620  spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1621  }
1622  }
1623  int minimum_reasonable_space = word_height / 3;
1624  if (minimum_reasonable_space < 2)
1625  minimum_reasonable_space = 2;
1626  int median = spacing_widths.median();
1627  return (median > minimum_reasonable_space)
1628  ? median : minimum_reasonable_space;
1629 }
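
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// InterwordSpace() above takes the median of the rows' average inter-word
// spaces but never returns less than max(2, word_height / 3), so rows made of
// a single word (which contribute no spacing samples) still yield a sane
// value. A minimal version of that guard, assuming the samples have already
// been collected into a vector (names invented):
#include <algorithm>
#include <vector>

static int MedianSpaceWithFloor(std::vector<int> samples, int word_height) {
  int floor_px = std::max(2, word_height / 3);  // minimum reasonable space
  if (samples.empty()) return floor_px;
  std::sort(samples.begin(), samples.end());
  return std::max(samples[samples.size() / 2], floor_px);
}
// --------------------------------------------------------------------------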
1630 
1631 // Return whether the first word on the after line can fit in the space at
1632 // the end of the before line (knowing which way the text is aligned and read).
1633 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1634  const RowScratchRegisters &after,
1635  tesseract::ParagraphJustification justification) {
1636  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1637  return true;
1638 
1639  if (justification == JUSTIFICATION_UNKNOWN) {
1640  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1641  }
1642  int available_space;
1643  if (justification == JUSTIFICATION_CENTER) {
1644  available_space = before.lindent_ + before.rindent_;
1645  } else {
1646  available_space = before.OffsideIndent(justification);
1647  }
1648  available_space -= before.ri_->average_interword_space;
1649 
1650  if (before.ri_->ltr)
1651  return after.ri_->lword_box.width() < available_space;
1652  return after.ri_->rword_box.width() < available_space;
1653 }
1654 
1655 // Return whether the first word on the after line can fit in the space at
1656 // the end of the before line (not knowing which way the text goes) in a left
1657 // or right alignment.
1658 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1659  const RowScratchRegisters &after) {
1660  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1661  return true;
1662 
1663  int available_space = before.lindent_;
1664  if (before.rindent_ > available_space)
1665  available_space = before.rindent_;
1666  available_space -= before.ri_->average_interword_space;
1667 
1668  if (before.ri_->ltr)
1669  return after.ri_->lword_box.width() < available_space;
1670  return after.ri_->rword_box.width() < available_space;
1671 }
1672 
1673 static bool TextSupportsBreak(const RowScratchRegisters &before,
1674  const RowScratchRegisters &after) {
1675  if (before.ri_->ltr) {
1676  return before.ri_->rword_likely_ends_idea &&
1677  after.ri_->lword_likely_starts_idea;
1678  } else {
1679  return before.ri_->lword_likely_ends_idea &&
1680  after.ri_->rword_likely_starts_idea;
1681  }
1682 }
1683 
1684 static bool LikelyParagraphStart(const RowScratchRegisters &before,
1685  const RowScratchRegisters &after,
1686  tesseract::ParagraphJustification j) {
1687  return before.ri_->num_words == 0 ||
1688  (FirstWordWouldHaveFit(before, after, j) &&
1689  TextSupportsBreak(before, after));
1690 }
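
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// LikelyParagraphStart() above combines a geometric signal (the next line's
// first word would have fit in the slack at the end of the previous line)
// with a textual signal (the previous line looks like it ends an idea and the
// next one looks like it starts one). The struct and field names below are
// invented stand-ins for the RowScratchRegisters data used above.
struct LineClues {
  int num_words = 0;
  int trailing_space_px = 0;    // slack after the last word, less one space
  int first_word_width_px = 0;  // width of this line's first word
  bool ends_idea = false;       // e.g. ends with '.', '!' or '?'
  bool starts_idea = false;     // e.g. starts with a capital or a bullet
};

static bool SketchLikelyParagraphStart(const LineClues &before,
                                       const LineClues &after) {
  if (before.num_words == 0) return true;  // blank line above: trivially a start
  bool geometry_ok = after.first_word_width_px < before.trailing_space_px;
  bool text_ok = before.ends_idea && after.starts_idea;
  return geometry_ok && text_ok;
}
// --------------------------------------------------------------------------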
1691 
1692 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1693 // would fit them as a single paragraph.
1694 // If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
1695 // If the rows given could be a consistent start to a paragraph, set *consistent
1696 // true.
1697 static ParagraphModel InternalParagraphModelByOutline(
1698  const GenericVector<RowScratchRegisters> *rows,
1699  int start, int end, int tolerance, bool *consistent) {
1700  int ltr_line_count = 0;
1701  for (int i = start; i < end; i++) {
1702  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1703  }
1704  bool ltr = (ltr_line_count >= (end - start) / 2);
1705 
1706  *consistent = true;
1707  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1708  return ParagraphModel();
1709 
1710  // Ensure the caller only passed us a region with a common rmargin and
1711  // lmargin.
1712  int lmargin = (*rows)[start].lmargin_;
1713  int rmargin = (*rows)[start].rmargin_;
1714  int lmin, lmax, rmin, rmax, cmin, cmax;
1715  lmin = lmax = (*rows)[start + 1].lindent_;
1716  rmin = rmax = (*rows)[start + 1].rindent_;
1717  cmin = cmax = 0;
1718  for (int i = start + 1; i < end; i++) {
1719  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1720  tprintf("Margins don't match! Software error.\n");
1721  *consistent = false;
1722  return ParagraphModel();
1723  }
1724  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1725  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1726  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1727  }
1728  int ldiff = lmax - lmin;
1729  int rdiff = rmax - rmin;
1730  int cdiff = cmax - cmin;
1731  if (rdiff > tolerance && ldiff > tolerance) {
1732  if (cdiff < tolerance * 2) {
1733  if (end - start < 3)
1734  return ParagraphModel();
1735  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1736  }
1737  *consistent = false;
1738  return ParagraphModel();
1739  }
1740  if (end - start < 3) // Don't return a model for two line paras.
1741  return ParagraphModel();
1742 
1743  // These booleans keep us from saying something is aligned left when the body
1744  // left variance is too large.
1745  bool body_admits_left_alignment = ldiff < tolerance;
1746  bool body_admits_right_alignment = rdiff < tolerance;
1747 
1748  ParagraphModel left_model =
1749  ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1750  (lmin + lmax) / 2, tolerance);
1751  ParagraphModel right_model =
1752  ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1753  (rmin + rmax) / 2, tolerance);
1754 
1755  // These booleans keep us from having an indent on the "wrong side" for the
1756  // first line.
1757  bool text_admits_left_alignment = ltr || left_model.is_flush();
1758  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1759 
1760  // At least one of the edges is less than tolerance in variance.
1761  // If the other is obviously ragged, it can't be the one aligned to.
1762  // [Note the last line is included in this raggedness.]
1763  if (tolerance < rdiff) {
1764  if (body_admits_left_alignment && text_admits_left_alignment)
1765  return left_model;
1766  *consistent = false;
1767  return ParagraphModel();
1768  }
1769  if (tolerance < ldiff) {
1770  if (body_admits_right_alignment && text_admits_right_alignment)
1771  return right_model;
1772  *consistent = false;
1773  return ParagraphModel();
1774  }
1775 
1776  // At this point, we know the body text doesn't vary much on either side.
1777 
1778  // If the first line juts out oddly in one direction or the other,
1779  // that likely indicates the side aligned to.
1780  int first_left = (*rows)[start].lindent_;
1781  int first_right = (*rows)[start].rindent_;
1782 
1783  if (ltr && body_admits_left_alignment &&
1784  (first_left < lmin || first_left > lmax))
1785  return left_model;
1786  if (!ltr && body_admits_right_alignment &&
1787  (first_right < rmin || first_right > rmax))
1788  return right_model;
1789 
1790  *consistent = false;
1791  return ParagraphModel();
1792 }
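
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// The decision above reduces to comparing how much the left and right indents
// of the body lines vary against a pixel tolerance: a tight left edge with a
// ragged right edge suggests left justification, the mirror image suggests
// right justification, and both edges varying while their difference stays
// stable suggests centering. The toy classifier below keeps only that core
// comparison (the first-line jut, LTR direction and margin checks above are
// ignored); all names are invented.
#include <algorithm>
#include <vector>

enum class SketchJust { kUnknown, kLeft, kRight, kCenter };

static SketchJust ClassifyByOutline(const std::vector<int> &lindent,
                                    const std::vector<int> &rindent,
                                    int tolerance) {
  if (lindent.size() < 2 || lindent.size() != rindent.size())
    return SketchJust::kUnknown;
  auto spread = [](const std::vector<int> &v) {
    auto mm = std::minmax_element(v.begin(), v.end());
    return *mm.second - *mm.first;
  };
  int ldiff = spread(lindent);
  int rdiff = spread(rindent);
  std::vector<int> center;
  for (size_t i = 0; i < lindent.size(); i++)
    center.push_back(rindent[i] - lindent[i]);
  int cdiff = spread(center);
  if (ldiff > tolerance && rdiff > tolerance)   // both edges ragged
    return cdiff < 2 * tolerance ? SketchJust::kCenter : SketchJust::kUnknown;
  if (rdiff > tolerance)                        // right ragged, left tight?
    return ldiff < tolerance ? SketchJust::kLeft : SketchJust::kUnknown;
  if (ldiff > tolerance)                        // left ragged, right tight?
    return rdiff < tolerance ? SketchJust::kRight : SketchJust::kUnknown;
  return SketchJust::kUnknown;  // both tight: the first-line jut decides above
}
// --------------------------------------------------------------------------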
1793 
1794 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1795 // would fit them as a single paragraph. If nothing fits,
1796 // justification_ = JUSTIFICATION_UNKNOWN and the paragraph is printed to
1797 // debug output if we're debugging.
1798 static ParagraphModel ParagraphModelByOutline(
1799  int debug_level,
1800  const GenericVector<RowScratchRegisters> *rows,
1801  int start, int end, int tolerance) {
1802  bool unused_consistent;
1803  ParagraphModel retval = InternalParagraphModelByOutline(
1804  rows, start, end, tolerance, &unused_consistent);
1805  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1806  tprintf("Could not determine a model for this paragraph:\n");
1807  PrintRowRange(*rows, start, end);
1808  }
1809  return retval;
1810 }
1811 
1812 // Do rows[start, end) form a single instance of the given paragraph model?
1813 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
1814  int start, int end, const ParagraphModel *model) {
1815  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1816  return false;
1817  if (!ValidFirstLine(rows, start, model)) return false;
1818  for (int i = start + 1 ; i < end; i++) {
1819  if (!ValidBodyLine(rows, i, model)) return false;
1820  }
1821  return true;
1822 }
1823 
1824 // Examine rows[row_start, row_end) as an independent section of text,
1825 // and mark rows that are exceptionally clear as start-of-paragraph
1826 // and paragraph-body lines.
1827 //
1828 // We presume that any lines surrounding rows[row_start, row_end) may
1829 // have wildly different paragraph models, so we don't key any data off
1830 // of those lines.
1831 //
1832 // We only take the very strongest signals, as we don't want to get
1833 // confused and mark up centered text, poetry, or source code as
1834 // clearly part of a typical paragraph.
1835 static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
1836  int row_start, int row_end) {
1837  // Record patently obvious body text.
1838  for (int i = row_start + 1; i < row_end; i++) {
1839  const RowScratchRegisters &prev = (*rows)[i - 1];
1840  RowScratchRegisters &curr = (*rows)[i];
1841  tesseract::ParagraphJustification typical_justification =
1842  prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1843  if (!curr.ri_->rword_likely_starts_idea &&
1844  !curr.ri_->lword_likely_starts_idea &&
1845  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1846  curr.SetBodyLine();
1847  }
1848  }
1849 
1850  // Record patently obvious start paragraph lines.
1851  //
1852  // It's an extremely good signal of the start of a paragraph that
1853  // the first word would have fit on the end of the previous line.
1854  // However, applying just that signal would have us mark random
1855  // start lines of lineated text (poetry and source code) and some
1856  // centered headings as paragraph start lines. Therefore, we use
1857  // a second qualification for a paragraph start: Not only should
1858  // the first word of this line have fit on the previous line,
1859  // but also, this line should go full to the right of the block,
1860  // disallowing a subsequent word from having fit on this line.
1861 
1862  // First row:
1863  {
1864  RowScratchRegisters &curr = (*rows)[row_start];
1865  RowScratchRegisters &next = (*rows)[row_start + 1];
1866  tesseract::ParagraphJustification j =
1867  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1868  if (curr.GetLineType() == LT_UNKNOWN &&
1869  !FirstWordWouldHaveFit(curr, next, j) &&
1870  (curr.ri_->lword_likely_starts_idea ||
1871  curr.ri_->rword_likely_starts_idea)) {
1872  curr.SetStartLine();
1873  }
1874  }
1875  // Middle rows
1876  for (int i = row_start + 1; i < row_end - 1; i++) {
1877  RowScratchRegisters &prev = (*rows)[i - 1];
1878  RowScratchRegisters &curr = (*rows)[i];
1879  RowScratchRegisters &next = (*rows)[i + 1];
1880  tesseract::ParagraphJustification j =
1881  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1882  if (curr.GetLineType() == LT_UNKNOWN &&
1883  !FirstWordWouldHaveFit(curr, next, j) &&
1884  LikelyParagraphStart(prev, curr, j)) {
1885  curr.SetStartLine();
1886  }
1887  }
1888  // Last row
1889  { // the short circuit at the top means we have at least two lines.
1890  RowScratchRegisters &prev = (*rows)[row_end - 2];
1891  RowScratchRegisters &curr = (*rows)[row_end - 1];
1892  tesseract::ParagraphJustification j =
1893  curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1894  if (curr.GetLineType() == LT_UNKNOWN &&
1895  !FirstWordWouldHaveFit(curr, curr, j) &&
1896  LikelyParagraphStart(prev, curr, j)) {
1897  curr.SetStartLine();
1898  }
1899  }
1900 }
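
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// The comment block above describes the double test behind an "obvious"
// paragraph start: this line's first word would have fit at the end of the
// previous line (the author broke early on purpose), yet this line runs full
// enough that the next line's first word would NOT have fit on it. The
// geometric half of that rule, with invented field names and the textual
// starts-idea qualification omitted:
struct StartClues {
  int trailing_space_px = 0;    // slack after the last word on the line
  int first_word_width_px = 0;  // width of the line's first word
};

// 'prev', 'curr' and 'next' are consecutive lines.
static bool SketchObviousStart(const StartClues &prev, const StartClues &curr,
                               const StartClues &next) {
  bool fit_on_prev = curr.first_word_width_px < prev.trailing_space_px;
  bool next_fits_on_curr = next.first_word_width_px < curr.trailing_space_px;
  return fit_on_prev && !next_fits_on_curr;
}
// --------------------------------------------------------------------------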
1901 
1902 // Look for sequences of a start line followed by some body lines in
1903 // rows[row_start, row_end) and create ParagraphModels for them if
1904 // they seem coherent.
1905 static void ModelStrongEvidence(int debug_level,
1906  GenericVector<RowScratchRegisters> *rows,
1907  int row_start, int row_end,
1908  bool allow_flush_models,
1909  ParagraphTheory *theory) {
1910  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1911  return;
1912 
1913  int start = row_start;
1914  while (start < row_end) {
1915  while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1916  start++;
1917  if (start >= row_end - 1)
1918  break;
1919 
1920  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1921  int end = start;
1922  ParagraphModel last_model;
1923  bool next_consistent;
1924  do {
1925  ++end;
1926  // rows[row, end) was consistent.
1927  // If rows[row, end + 1) is not consistent,
1928  // just model rows[row, end)
1929  if (end < row_end - 1) {
1930  RowScratchRegisters &next = (*rows)[end];
1931  LineType lt = next.GetLineType();
1932  next_consistent = lt == LT_BODY ||
1933  (lt == LT_UNKNOWN &&
1934  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1935  } else {
1936  next_consistent = false;
1937  }
1938  if (next_consistent) {
1939  ParagraphModel next_model = InternalParagraphModelByOutline(
1940  rows, start, end + 1, tolerance, &next_consistent);
1941  if (((*rows)[start].ri_->ltr &&
1942  last_model.justification() == JUSTIFICATION_LEFT &&
1943  next_model.justification() != JUSTIFICATION_LEFT) ||
1944  (!(*rows)[start].ri_->ltr &&
1945  last_model.justification() == JUSTIFICATION_RIGHT &&
1946  next_model.justification() != JUSTIFICATION_RIGHT)) {
1947  next_consistent = false;
1948  }
1949  last_model = next_model;
1950  } else {
1951  next_consistent = false;
1952  }
1953  } while (next_consistent && end < row_end);
1954  // At this point, rows[start, end) looked like it could have been a
1955  // single paragraph. If we can make a good ParagraphModel for it,
1956  // do so and mark this sequence with that model.
1957  if (end > start + 1) {
1958  // emit a new paragraph if we have more than one line.
1959  const ParagraphModel *model = nullptr;
1960  ParagraphModel new_model = ParagraphModelByOutline(
1961  debug_level, rows, start, end,
1962  Epsilon(InterwordSpace(*rows, start, end)));
1963  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1964  // couldn't create a good model, oh well.
1965  } else if (new_model.is_flush()) {
1966  if (end == start + 2) {
1967  // It's very likely we just got two paragraph starts in a row.
1968  end = start + 1;
1969  } else if (start == row_start) {
1970  // Mark this as a Crown.
1971  if (new_model.justification() == JUSTIFICATION_LEFT) {
1972  model = kCrownLeft;
1973  } else {
1974  model = kCrownRight;
1975  }
1976  } else if (allow_flush_models) {
1977  model = theory->AddModel(new_model);
1978  }
1979  } else {
1980  model = theory->AddModel(new_model);
1981  }
1982  if (model) {
1983  (*rows)[start].AddStartLine(model);
1984  for (int i = start + 1; i < end; i++) {
1985  (*rows)[i].AddBodyLine(model);
1986  }
1987  }
1988  }
1989  start = end;
1990  }
1991 }
1992 
1993 // We examine rows[row_start, row_end) and do the following:
1994 // (1) Clear all existing hypotheses for the rows being considered.
1995 // (2) Mark up any rows as exceptionally likely to be paragraph starts
1996 // or paragraph body lines as such using both geometric and textual
1997 // clues.
1998 // (3) Form models for any sequence of start + continuation lines.
1999 // (4) Smear the paragraph models to cover surrounding text.
2000 static void StrongEvidenceClassify(int debug_level,
2001  GenericVector<RowScratchRegisters> *rows,
2002  int row_start, int row_end,
2003  ParagraphTheory *theory) {
2004  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2005  return;
2006 
2007  if (debug_level > 1) {
2008  tprintf("#############################################\n");
2009  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2010  tprintf("#############################################\n");
2011  }
2012 
2013  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2014  MarkStrongEvidence(rows, row_start, row_end);
2015 
2016  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2017 
2018  // Create paragraph models.
2019  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2020 
2021  DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
2022 
2023  // At this point, some rows are marked up as paragraphs with model numbers,
2024  // and some rows are marked up as either LT_START or LT_BODY. Now let's
2025  // smear any good paragraph hypotheses forward and backward.
2026  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2027  smearer.Smear();
2028 }
2029 
2030 static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
2031  int row_start, int row_end,
2032  ParagraphTheory *theory) {
2033  for (int i = row_start + 1; i < row_end - 1; i++) {
2034  if ((*rows)[i - 1].ri_->has_leaders &&
2035  (*rows)[i].ri_->has_leaders &&
2036  (*rows)[i + 1].ri_->has_leaders) {
2037  const ParagraphModel *model = theory->AddModel(
2038  ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2039  (*rows)[i].AddStartLine(model);
2040  }
2041  }
2042 }
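
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// SeparateSimpleLeaderLines() above gives each row that sits strictly inside
// a run of three or more leader-dot rows its own single-line paragraph, which
// is the usual shape of a table of contents. The windowed test by itself,
// over a plain per-row flag (names invented):
#include <vector>

// Returns the indices of rows that are leader rows with leader rows on both
// sides; row 0 and the last row are never selected, matching the loop above.
static std::vector<int> MiddleLeaderRows(const std::vector<bool> &has_leaders) {
  std::vector<int> starts;
  for (int i = 1; i + 1 < static_cast<int>(has_leaders.size()); i++) {
    if (has_leaders[i - 1] && has_leaders[i] && has_leaders[i + 1])
      starts.push_back(i);
  }
  return starts;
}
// --------------------------------------------------------------------------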
2043 
2044 // Collect sequences of unique hypotheses in row registers and create proper
2045 // paragraphs for them, referencing the paragraphs in row_owners.
2046 static void ConvertHypothesizedModelRunsToParagraphs(
2047  int debug_level,
2048  GenericVector<RowScratchRegisters> &rows,
2049  GenericVector<PARA *> *row_owners,
2050  ParagraphTheory *theory) {
2051  int end = rows.size();
2052  int start;
2053  for (; end > 0; end = start) {
2054  start = end - 1;
2055  const ParagraphModel *model = nullptr;
2056  // TODO(eger): Be smarter about dealing with multiple hypotheses.
2057  bool single_line_paragraph = false;
2058  SetOfModels models;
2059  rows[start].NonNullHypotheses(&models);
2060  if (!models.empty()) {
2061  model = models[0];
2062  if (rows[start].GetLineType(model) != LT_BODY)
2063  single_line_paragraph = true;
2064  }
2065  if (model && !single_line_paragraph) {
2066  // walk back looking for more body lines and then a start line.
2067  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2068  // do nothing
2069  }
2070  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2071  model = nullptr;
2072  }
2073  }
2074  if (model == nullptr) {
2075  continue;
2076  }
2077  // rows[start, end) should be a paragraph.
2078  PARA *p = new PARA();
2079  if (model == kCrownLeft || model == kCrownRight) {
2080  p->is_very_first_or_continuation = true;
2081  // Crown paragraph.
2082  // If we can find an existing ParagraphModel that fits, use it,
2083  // else create a new one.
2084  for (int row = end; row < rows.size(); row++) {
2085  if ((*row_owners)[row] &&
2086  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2087  (start == 0 ||
2088  ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2089  model = (*row_owners)[row]->model;
2090  break;
2091  }
2092  }
2093  if (model == kCrownLeft) {
2094  // No subsequent model fits, so cons one up.
2095  model = theory->AddModel(ParagraphModel(
2096  JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2097  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2098  } else if (model == kCrownRight) {
2099  // No subsequent model fits, so cons one up.
2100  model = theory->AddModel(ParagraphModel(
2101  JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
2102  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2103  }
2104  }
2105  rows[start].SetUnknown();
2106  rows[start].AddStartLine(model);
2107  for (int i = start + 1; i < end; i++) {
2108  rows[i].SetUnknown();
2109  rows[i].AddBodyLine(model);
2110  }
2111  p->model = model;
2112  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2113  p->is_list_item =
2114  model->justification() == JUSTIFICATION_RIGHT
2115  ? rows[start].ri_->rword_indicates_list_item
2116  : rows[start].ri_->lword_indicates_list_item;
2117  for (int row = start; row < end; row++) {
2118  if ((*row_owners)[row] != nullptr) {
2119  tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() called "
2120  "more than once!\n");
2121  delete (*row_owners)[row];
2122  }
2123  (*row_owners)[row] = p;
2124  }
2125  }
2126 }
2127 
2128 struct Interval {
2129  Interval() : begin(0), end(0) {}
2130  Interval(int b, int e) : begin(b), end(e) {}
2131 
2132  int begin;
2133  int end;
2134 };
2135 
2136 // Return whether rows[row] appears to be stranded, meaning that the evidence
2137 // for this row is very weak due to context. For instance, two lines of source
2138 // code may happen to be indented at the same tab vector as body text starts,
2139 // leading us to think they are two start-of-paragraph lines. This is not
2140 // optimal. However, we also don't want to mark a sequence of short dialog
2141 // as "weak," so our heuristic is:
2142 // (1) If a line is surrounded by lines of unknown type, it's weak.
2143 // (2) If two lines in a row are start lines for a given paragraph type, but
2144 // after that the same paragraph type does not continue, they're weak.
2145 static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows,
2146  int row) {
2147  SetOfModels row_models;
2148  rows[row].StrongHypotheses(&row_models);
2149 
2150  for (int m = 0; m < row_models.size(); m++) {
2151  bool all_starts = rows[row].GetLineType();
2152  int run_length = 1;
2153  bool continues = true;
2154  for (int i = row - 1; i >= 0 && continues; i--) {
2155  SetOfModels models;
2156  rows[i].NonNullHypotheses(&models);
2157  switch (rows[i].GetLineType(row_models[m])) {
2158  case LT_START: run_length++; break;
2159  case LT_MULTIPLE: // explicit fall-through
2160  case LT_BODY: run_length++; all_starts = false; break;
2161  case LT_UNKNOWN: // explicit fall-through
2162  default: continues = false;
2163  }
2164  }
2165  continues = true;
2166  for (int i = row + 1; i < rows.size() && continues; i++) {
2167  SetOfModels models;
2168  rows[i].NonNullHypotheses(&models);
2169  switch (rows[i].GetLineType(row_models[m])) {
2170  case LT_START: run_length++; break;
2171  case LT_MULTIPLE: // explicit fall-through
2172  case LT_BODY: run_length++; all_starts = false; break;
2173  case LT_UNKNOWN: // explicit fall-through
2174  default: continues = false;
2175  }
2176  }
2177  if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2178  }
2179  return true;
2180 }
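
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// RowIsStranded() above asks, for each strong hypothesis a row carries, how
// long a run of compatible lines surrounds it; the row is "stranded" only if
// no hypothesis is backed by a run of more than two lines (or more than one
// line once a genuine body line appears in the run). A simplified reading of
// that rule for a single hypothesis, with an invented three-value line type:
#include <vector>

enum class SketchLineType { kStart, kBody, kUnknown };

// 'types' holds each line's type under one hypothesis; assumes
// 0 <= row < types.size(). Returns whether the run around 'row' rescues it.
static bool SketchRunRescuesRow(const std::vector<SketchLineType> &types,
                                int row) {
  int run_length = 1;
  bool all_starts = types[row] == SketchLineType::kStart;
  for (int i = row - 1; i >= 0 && types[i] != SketchLineType::kUnknown; i--) {
    run_length++;
    if (types[i] != SketchLineType::kStart) all_starts = false;
  }
  for (int i = row + 1; i < static_cast<int>(types.size()) &&
                        types[i] != SketchLineType::kUnknown; i++) {
    run_length++;
    if (types[i] != SketchLineType::kStart) all_starts = false;
  }
  return run_length > 2 || (!all_starts && run_length > 1);
}
// --------------------------------------------------------------------------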
2181 
2182 // Go through rows[row_start, row_end) and gather up sequences that need better
2183 // classification.
2184 // + Sequences of non-empty rows without hypotheses.
2185 // + Crown paragraphs not immediately followed by a strongly modeled line.
2186 // + Single line paragraphs surrounded by text that doesn't match the
2187 // model.
2188 static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
2189  GenericVector<Interval> *to_fix,
2190  int row_start, int row_end) {
2191  to_fix->clear();
2192  for (int i = row_start; i < row_end; i++) {
2193  bool needs_fixing = false;
2194 
2195  SetOfModels models;
2196  SetOfModels models_w_crowns;
2197  rows[i].StrongHypotheses(&models);
2198  rows[i].NonNullHypotheses(&models_w_crowns);
2199  if (models.empty() && !models_w_crowns.empty()) {
2200  // Crown paragraph. Is it followed by a modeled line?
2201  for (int end = i + 1; end < rows.size(); end++) {
2202  SetOfModels end_models;
2203  SetOfModels strong_end_models;
2204  rows[end].NonNullHypotheses(&end_models);
2205  rows[end].StrongHypotheses(&strong_end_models);
2206  if (end_models.empty()) {
2207  needs_fixing = true;
2208  break;
2209  } else if (!strong_end_models.empty()) {
2210  needs_fixing = false;
2211  break;
2212  }
2213  }
2214  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2215  // No models at all.
2216  needs_fixing = true;
2217  }
2218 
2219  if (!needs_fixing && !models.empty()) {
2220  needs_fixing = RowIsStranded(rows, i);
2221  }
2222 
2223  if (needs_fixing) {
2224  if (!to_fix->empty() && to_fix->back().end == i - 1)
2225  to_fix->back().end = i;
2226  else
2227  to_fix->push_back(Interval(i, i));
2228  }
2229  }
2230  // Convert inclusive intervals to half-open intervals.
2231  for (int i = 0; i < to_fix->size(); i++) {
2232  (*to_fix)[i].end = (*to_fix)[i].end + 1;
2233  }
2234 }
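
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// LeftoverSegments() above accumulates the rows that still need work into
// maximal runs, building inclusive [begin, end] intervals and converting them
// to half-open [begin, end) only at the very end. That accumulation pattern
// on its own, over a plain per-row flag (names invented):
#include <vector>

struct SketchInterval { int begin; int end; };  // half-open on return

static std::vector<SketchInterval> GatherRuns(const std::vector<bool> &fix) {
  std::vector<SketchInterval> runs;
  for (int i = 0; i < static_cast<int>(fix.size()); i++) {
    if (!fix[i]) continue;
    if (!runs.empty() && runs.back().end == i - 1)
      runs.back().end = i;          // extend the current inclusive run
    else
      runs.push_back({i, i});       // start a new run
  }
  for (SketchInterval &r : runs) r.end += 1;  // inclusive -> half-open
  return runs;
}
// --------------------------------------------------------------------------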
2235 
2236 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2237 // normalize each row_owner to point to an actual PARA, and output the
2238 // paragraphs in order onto paragraphs.
2239 void CanonicalizeDetectionResults(
2240  GenericVector<PARA *> *row_owners,
2241  PARA_LIST *paragraphs) {
2242  GenericVector<PARA *> &rows = *row_owners;
2243  paragraphs->clear();
2244  PARA_IT out(paragraphs);
2245  PARA *formerly_null = nullptr;
2246  for (int i = 0; i < rows.size(); i++) {
2247  if (rows[i] == nullptr) {
2248  if (i == 0 || rows[i - 1] != formerly_null) {
2249  rows[i] = formerly_null = new PARA();
2250  } else {
2251  rows[i] = formerly_null;
2252  continue;
2253  }
2254  } else if (i > 0 && rows[i - 1] == rows[i]) {
2255  continue;
2256  }
2257  out.add_after_then_move(rows[i]);
2258  }
2259 }
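
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// CanonicalizeDetectionResults() above replaces each run of consecutive
// nullptr row owners with one freshly allocated PARA and then emits every
// distinct paragraph once, in row order. The same coalescing with invented
// stand-in types (SketchPara is not the real PARA; ownership of fresh objects
// is parked in a caller-supplied pool):
#include <memory>
#include <vector>

struct SketchPara {};  // stand-in for PARA

static std::vector<SketchPara *> SketchCanonicalize(
    std::vector<SketchPara *> *owners,
    std::vector<std::unique_ptr<SketchPara>> *pool) {
  std::vector<SketchPara *> ordered;
  SketchPara *formerly_null = nullptr;
  for (size_t i = 0; i < owners->size(); i++) {
    if ((*owners)[i] == nullptr) {
      if (i == 0 || (*owners)[i - 1] != formerly_null) {
        pool->push_back(std::unique_ptr<SketchPara>(new SketchPara()));
        formerly_null = pool->back().get();
        (*owners)[i] = formerly_null;   // start a fresh paragraph for the run
      } else {
        (*owners)[i] = formerly_null;   // continue the current nullptr run
        continue;
      }
    } else if (i > 0 && (*owners)[i - 1] == (*owners)[i]) {
      continue;                         // same paragraph as the row above
    }
    ordered.push_back((*owners)[i]);
  }
  return ordered;
}
// --------------------------------------------------------------------------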
2260 
2261 // Main entry point for Paragraph Detection Algorithm.
2262 //
2263 // Given a set of equally spaced textlines (described by row_infos),
2264 // Split them into paragraphs.
2265 //
2266 // Output:
2267 // row_owners - one pointer for each row, to the paragraph it belongs to.
2268 // paragraphs - this is the actual list of PARA objects.
2269 // models - the list of paragraph models referenced by the PARA objects.
2270 // caller is responsible for deleting the models.
2271 void DetectParagraphs(int debug_level,
2272  GenericVector<RowInfo> *row_infos,
2273  GenericVector<PARA *> *row_owners,
2274  PARA_LIST *paragraphs,
2275  GenericVector<ParagraphModel *> *models) {
2276  GenericVector<RowScratchRegisters> rows;
2277  ParagraphTheory theory(models);
2278 
2279  // Initialize row_owners to be a bunch of nullptr pointers.
2280  row_owners->init_to_size(row_infos->size(), nullptr);
2281 
2282  // Set up row scratch registers for the main algorithm.
2283  rows.init_to_size(row_infos->size(), RowScratchRegisters());
2284  for (int i = 0; i < row_infos->size(); i++) {
2285  rows[i].Init((*row_infos)[i]);
2286  }
2287 
2288  // Pass 1:
2289  // Detect sequences of lines that all contain leader dots (.....)
2290  // These are likely Tables of Contents. If there are three text lines in
2291  // a row with leader dots, it's pretty safe to say the middle one should
2292  // be a paragraph of its own.
2293  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2294 
2295  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2296 
2297  GenericVector<Interval> leftovers;
2298  LeftoverSegments(rows, &leftovers, 0, rows.size());
2299  for (int i = 0; i < leftovers.size(); i++) {
2300  // Pass 2a:
2301  // Find any strongly evidenced start-of-paragraph lines. If they're
2302  // followed by two lines that look like body lines, make a paragraph
2303  // model for that and see if that model applies throughout the text
2304  // (that is, "smear" it).
2305  StrongEvidenceClassify(debug_level, &rows,
2306  leftovers[i].begin, leftovers[i].end, &theory);
2307 
2308  // Pass 2b:
2309  // If we had any luck in pass 2a, we got part of the page and didn't
2310  // know how to classify a few runs of rows. Take the segments that
2311  // didn't find a model and reprocess them individually.
2312  GenericVector<Interval> leftovers2;
2313  LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2314  bool pass2a_was_useful = leftovers2.size() > 1 ||
2315  (leftovers2.size() == 1 &&
2316  (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2317  if (pass2a_was_useful) {
2318  for (int j = 0; j < leftovers2.size(); j++) {
2319  StrongEvidenceClassify(debug_level, &rows,
2320  leftovers2[j].begin, leftovers2[j].end,
2321  &theory);
2322  }
2323  }
2324  }
2325 
2326  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2327 
2328  // Pass 3:
2329  // These are the dregs for which we didn't have enough strong textual
2330  // and geometric clues to form matching models for. Let's see if
2331  // the geometric clues are simple enough that we could just use those.
2332  LeftoverSegments(rows, &leftovers, 0, rows.size());
2333  for (int i = 0; i < leftovers.size(); i++) {
2334  GeometricClassify(debug_level, &rows,
2335  leftovers[i].begin, leftovers[i].end, &theory);
2336  }
2337 
2338  // Undo any flush models for which there's little evidence.
2339  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2340 
2341  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2342 
2343  // Pass 4:
2344  // Take everything that's still not marked up well and clear all markings.
2345  LeftoverSegments(rows, &leftovers, 0, rows.size());
2346  for (int i = 0; i < leftovers.size(); i++) {
2347  for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2348  rows[j].SetUnknown();
2349  }
2350  }
2351 
2352  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2353 
2354  // Convert all of the unique hypothesis runs to PARAs.
2355  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2356  &theory);
2357 
2358  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2359 
2360  // Finally, clean up any dangling nullptr row paragraph parents.
2361  CanonicalizeDetectionResults(row_owners, paragraphs);
2362 }
2363 
2364 // ============ Code interfacing with the rest of Tesseract ==================
2365 
2366 static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
2367  RowInfo *info) {
2368  // Set up text, lword_text, and rword_text (mostly for debug printing).
2369  STRING fake_text;
2370  PageIterator pit(static_cast<const PageIterator&>(it));
2371  bool first_word = true;
2372  if (!pit.Empty(RIL_WORD)) {
2373  do {
2374  fake_text += "x";
2375  if (first_word) info->lword_text += "x";
2376  info->rword_text += "x";
2377  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2378  !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2379  fake_text += " ";
2380  info->rword_text = "";
2381  first_word = false;
2382  }
2383  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2384  pit.Next(RIL_SYMBOL));
2385  }
2386  if (fake_text.size() == 0) return;
2387 
2388  int lspaces = info->pix_ldistance / info->average_interword_space;
2389  for (int i = 0; i < lspaces; i++) {
2390  info->text += ' ';
2391  }
2392  info->text += fake_text;
2393 
2394  // Set up lword_box, rword_box, and num_words.
2395  PAGE_RES_IT page_res_it = *it.PageResIt();
2396  WERD_RES *word_res = page_res_it.restart_row();
2397  ROW_RES *this_row = page_res_it.row();
2398 
2399  WERD_RES *lword = nullptr;
2400  WERD_RES *rword = nullptr;
2401  info->num_words = 0;
2402  do {
2403  if (word_res) {
2404  if (!lword) lword = word_res;
2405  if (rword != word_res) info->num_words++;
2406  rword = word_res;
2407  }
2408  word_res = page_res_it.forward();
2409  } while (page_res_it.row() == this_row);
2410 
2411  if (lword) info->lword_box = lword->word->bounding_box();
2412  if (rword) info->rword_box = rword->word->bounding_box();
2413 }
2414 
2415 
2416 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2417 // detector RowInfo with all relevant information from the row.
2418 static void InitializeRowInfo(bool after_recognition,
2419  const MutableIterator &it, RowInfo *info) {
2420  if (it.PageResIt()->row() != nullptr) {
2421  ROW *row = it.PageResIt()->row()->row;
2422  info->pix_ldistance = row->lmargin();
2423  info->pix_rdistance = row->rmargin();
2424  info->average_interword_space =
2425  row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2426  info->pix_xheight = row->x_height();
2427  info->has_leaders = false;
2428  info->has_drop_cap = row->has_drop_cap();
2429  info->ltr = true; // set below depending on word scripts
2430  } else {
2431  info->pix_ldistance = info->pix_rdistance = 0;
2432  info->average_interword_space = 1;
2433  info->pix_xheight = 1.0;
2434  info->has_leaders = false;
2435  info->has_drop_cap = false;
2436  info->ltr = true;
2437  }
2438 
2439  info->num_words = 0;
2440  info->lword_indicates_list_item = false;
2441  info->lword_likely_starts_idea = false;
2442  info->lword_likely_ends_idea = false;
2443  info->rword_indicates_list_item = false;
2444  info->rword_likely_starts_idea = false;
2445  info->rword_likely_ends_idea = false;
2446  info->has_leaders = false;
2447  info->ltr = 1;
2448 
2449  if (!after_recognition) {
2450  InitializeTextAndBoxesPreRecognition(it, info);
2451  return;
2452  }
2453  info->text = "";
2454  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2455  int trailing_ws_idx = strlen(text.get()); // strip trailing space
2456  while (trailing_ws_idx > 0 &&
2457  // isspace() only takes ASCII
2458  isascii(text[trailing_ws_idx - 1]) &&
2459  isspace(text[trailing_ws_idx - 1]))
2460  trailing_ws_idx--;
2461  if (trailing_ws_idx > 0) {
2462  int lspaces = info->pix_ldistance / info->average_interword_space;
2463  for (int i = 0; i < lspaces; i++)
2464  info->text += ' ';
2465  for (int i = 0; i < trailing_ws_idx; i++)
2466  info->text += text[i];
2467  }
2468 
2469  if (info->text.size() == 0) {
2470  return;
2471  }
2472 
2473  PAGE_RES_IT page_res_it = *it.PageResIt();
2474  GenericVector<WERD_RES *> werds;
2475  WERD_RES *word_res = page_res_it.restart_row();
2476  ROW_RES *this_row = page_res_it.row();
2477  int num_leaders = 0;
2478  int ltr = 0;
2479  int rtl = 0;
2480  do {
2481  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2482  werds.push_back(word_res);
2483  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2484  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2485  if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2486  }
2487  word_res = page_res_it.forward();
2488  } while (page_res_it.row() == this_row);
2489  info->ltr = ltr >= rtl;
2490  info->has_leaders = num_leaders > 3;
2491  info->num_words = werds.size();
2492  if (!werds.empty()) {
2493  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2494  info->lword_text = lword->best_choice->unichar_string().string();
2495  info->rword_text = rword->best_choice->unichar_string().string();
2496  info->lword_box = lword->word->bounding_box();
2497  info->rword_box = rword->word->bounding_box();
2498  LeftWordAttributes(lword->uch_set, lword->best_choice,
2499  info->lword_text,
2500  &info->lword_indicates_list_item,
2501  &info->lword_likely_starts_idea,
2502  &info->lword_likely_ends_idea);
2503  RightWordAttributes(rword->uch_set, rword->best_choice,
2504  info->rword_text,
2505  &info->rword_indicates_list_item,
2506  &info->rword_likely_starts_idea,
2507  &info->rword_likely_ends_idea);
2508  }
2509 }
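
// --------------------------------------------------------------------------
// [Editorial sketch -- standalone illustration, not part of paragraphs.cpp.]
// InitializeRowInfo() above (and the pre-recognition variant before it)
// rebuilds a line's visual indentation by prepending roughly
// pix_ldistance / average_interword_space space characters, after trimming
// trailing ASCII whitespace from the recognized text. That string-building
// step on its own (function name invented; the division-by-zero guard is an
// addition here, since the original keeps average_interword_space >= 1):
#include <cctype>
#include <string>

static std::string SketchIndentedText(const std::string &line_text,
                                      int pix_ldistance,
                                      int average_interword_space) {
  size_t end = line_text.size();
  while (end > 0) {
    unsigned char c = static_cast<unsigned char>(line_text[end - 1]);
    if (c >= 0x80 || !isspace(c)) break;  // isspace() only takes ASCII
    end--;
  }
  if (end == 0) return std::string();
  int lspaces = average_interword_space > 0
                    ? pix_ldistance / average_interword_space
                    : 0;
  return std::string(lspaces, ' ') + line_text.substr(0, end);
}
// --------------------------------------------------------------------------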
2510 
2511 // This is called after rows have been identified and words are recognized.
2512 // Much of this could be implemented before word recognition, but text helps
2513 // to identify bulleted lists and gives good signals for sentence boundaries.
2514 void DetectParagraphs(int debug_level,
2515  bool after_text_recognition,
2516  const MutableIterator *block_start,
2517  GenericVector<ParagraphModel *> *models) {
2518  // Clear out any preconceived notions.
2519  if (block_start->Empty(RIL_TEXTLINE)) {
2520  return;
2521  }
2522  BLOCK *block = block_start->PageResIt()->block()->block;
2523  block->para_list()->clear();
2524  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2525 
2526  // Convert the Tesseract structures to RowInfos
2527  // for the paragraph detection algorithm.
2528  MutableIterator row(*block_start);
2529  if (row.Empty(RIL_TEXTLINE))
2530  return; // end of input already.
2531 
2532  GenericVector<RowInfo> row_infos;
2533  do {
2534  if (!row.PageResIt()->row())
2535  continue; // empty row.
2536  row.PageResIt()->row()->row->set_para(nullptr);
2537  row_infos.push_back(RowInfo());
2538  RowInfo &ri = row_infos.back();
2539  InitializeRowInfo(after_text_recognition, row, &ri);
2540  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2541  row.Next(RIL_TEXTLINE));
2542 
2543  // If we're called before text recognition, we might not have
2544  // tight block bounding boxes, so trim by the minimum on each side.
2545  if (!row_infos.empty()) {
2546  int min_lmargin = row_infos[0].pix_ldistance;
2547  int min_rmargin = row_infos[0].pix_rdistance;
2548  for (int i = 1; i < row_infos.size(); i++) {
2549  if (row_infos[i].pix_ldistance < min_lmargin)
2550  min_lmargin = row_infos[i].pix_ldistance;
2551  if (row_infos[i].pix_rdistance < min_rmargin)
2552  min_rmargin = row_infos[i].pix_rdistance;
2553  }
2554  if (min_lmargin > 0 || min_rmargin > 0) {
2555  for (int i = 0; i < row_infos.size(); i++) {
2556  row_infos[i].pix_ldistance -= min_lmargin;
2557  row_infos[i].pix_rdistance -= min_rmargin;
2558  }
2559  }
2560  }
2561 
2562  // Run the paragraph detection algorithm.
2563  GenericVector<PARA *> row_owners;
2564  GenericVector<PARA *> the_paragraphs;
2565  if (!is_image_block) {
2566  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2567  models);
2568  } else {
2569  row_owners.init_to_size(row_infos.size(), nullptr);
2570  CanonicalizeDetectionResults(&row_owners, block->para_list());
2571  }
2572 
2573  // Now stitch in the row_owners into the rows.
2574  row = *block_start;
2575  for (int i = 0; i < row_owners.size(); i++) {
2576  while (!row.PageResIt()->row())
2577  row.Next(RIL_TEXTLINE);
2578  row.PageResIt()->row()->row->set_para(row_owners[i]);
2579  row.Next(RIL_TEXTLINE);
2580  }
2581 }
2582 
2583 } // namespace tesseract