tessapi/3.x/a00716_source.html

 /**********************************************************************

  * File:        paragraphs.cpp

  * Description: Paragraph detection for tesseract.

  * Author:      David Eger

  * Created:     25 February 2011

  *

  * (C) Copyright 2011, Google Inc.

  ** Licensed under the Apache License, Version 2.0 (the "License");

  ** you may not use this file except in compliance with the License.

  ** You may obtain a copy of the License at

  ** http://www.apache.org/licenses/LICENSE-2.0

  ** Unless required by applicable law or agreed to in writing, software

  ** distributed under the License is distributed on an "AS IS" BASIS,

  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  ** See the License for the specific language governing permissions and

  ** limitations under the License.

  *

  **********************************************************************/

 #ifdef _MSC_VER

 #define __func__ __FUNCTION__

 #endif


 #include <ctype.h>


 #include "genericvector.h"

 #include "helpers.h"

 #include "mutableiterator.h"

 #include "ocrpara.h"

 #include "pageres.h"

 #include "paragraphs.h"

 #include "paragraphs_internal.h"

 #include "publictypes.h"

 #include "ratngs.h"

 #include "rect.h"

 #include "statistc.h"

 #include "strngs.h"

 #include "tprintf.h"

 #include "unicharset.h"

 #include "unicodes.h"


 namespace tesseract {


 // Special "weak" ParagraphModels.
 //
 // These two pointers are sentinels: they are manufactured from the integer
 // constants below rather than taken from real ParagraphModel objects.  Within
 // this file they are only compared by address (AppendDebugInfo prints them as
 // "CrL" and "CrR" respectively).

 const ParagraphModel *kCrownLeft

     = reinterpret_cast<ParagraphModel *>(0xDEAD111F);

 const ParagraphModel *kCrownRight

     = reinterpret_cast<ParagraphModel *>(0xDEAD888F);


 // Given the width (in pixels) of a typical space between words, return the
 // threshold by which we think left and right alignments for paragraphs can
 // vary and still be considered aligned.  The result is four fifths of
 // space_pix, computed with truncating integer arithmetic.
 static int Epsilon(int space_pix) {
   const int four_spaces = space_pix * 4;
   return four_spaces / 5;
 }


 // Sanity-check the half-open row range rows[row_start, row_end) on behalf of
 // the function called function_name.
 //   - If the range is reversed or lies outside *rows, a message is always
 //     printed and false is returned.
 //   - If the range holds fewer than min_num_rows rows, false is returned and
 //     a message is printed only when debug_level > 1.
 //   - Otherwise true is returned.
 static bool AcceptableRowArgs(
     int debug_level, int min_num_rows, const char *function_name,
     const GenericVector<RowScratchRegisters> *rows,
     int row_start, int row_end) {
   const bool range_ok =
       row_start >= 0 && row_end <= rows->size() && row_start <= row_end;
   if (!range_ok) {
     tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
             row_start, row_end, rows->size());
     return false;
   }

   const int num_rows = row_end - row_start;
   if (num_rows >= min_num_rows)
     return true;

   if (debug_level > 1) {
     tprintf("# Too few rows[%d, %d) for %s.\n",
             row_start, row_end, function_name);
   }
   return false;
 }


 // =============================== Debug Code ================================


 // Format num in base 10 and return it as a STRING.
 static STRING StrOf(int num) {
   char digits[30];
   snprintf(digits, sizeof(digits), "%d", num);
   STRING result(digits);
   return result;
 }


 // Given a row-major matrix of unicode text and a column separator, print

 // a formatted table.  For ASCII, we get good column alignment.

 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,

                        const STRING &colsep) {

   GenericVector<int> max_col_widths;

   for (int r = 0; r < rows.size(); r++) {

     int num_columns = rows[r].size();

     for (int c = 0; c < num_columns; c++) {

       int num_unicodes = 0;

       for (int i = 0; i < rows[r][c].size(); i++) {

         if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;

       }

       if (c >= max_col_widths.size()) {

         max_col_widths.push_back(num_unicodes);

       } else {

         if (num_unicodes > max_col_widths[c])

           max_col_widths[c] = num_unicodes;

       }

     }

   }


   GenericVector<STRING> col_width_patterns;

   for (int c = 0; c < max_col_widths.size(); c++) {

     col_width_patterns.push_back(

         STRING("%-") + StrOf(max_col_widths[c]) + "s");

   }


   for (int r = 0; r < rows.size(); r++) {

     for (int c = 0; c < rows[r].size(); c++) {

       if (c > 0)

         tprintf("%s", colsep.string());

       tprintf(col_width_patterns[c].string(), rows[r][c].string());

     }

     tprintf("\n");

   }

 }


 // Return word unchanged when rtlify is false; otherwise return word wrapped
 // between kRLE and kPDF (presumably the Unicode right-to-left embedding and
 // pop-directional-formatting marks -- confirm against unicodes.h).
 STRING RtlEmbed(const STRING &word, bool rtlify) {
   if (!rtlify)
     return word;
   return STRING(kRLE) + word + STRING(kPDF);
 }


 // Print the current thoughts of the paragraph detector.

 static void PrintDetectorState(const ParagraphTheory &theory,

                                const GenericVector<RowScratchRegisters> &rows) {

   GenericVector<GenericVector<STRING> > output;

   output.push_back(GenericVector<STRING>());

   output.back().push_back("#row");

   output.back().push_back("space");

   output.back().push_back("..");

   output.back().push_back("lword[widthSEL]");

   output.back().push_back("rword[widthSEL]");

   RowScratchRegisters::AppendDebugHeaderFields(&output.back());

   output.back().push_back("text");


   for (int i = 0; i < rows.size(); i++) {

     output.push_back(GenericVector<STRING>());

     GenericVector<STRING> &row = output.back();

     const RowInfo& ri = *rows[i].ri_;

     row.push_back(StrOf(i));

     row.push_back(StrOf(ri.average_interword_space));

     row.push_back(ri.has_leaders ? ".." : " ");

     row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +

                   "[" + StrOf(ri.lword_box.width()) +

                   (ri.lword_likely_starts_idea ? "S" : "s") +

                   (ri.lword_likely_ends_idea ? "E" : "e") +

                   (ri.lword_indicates_list_item ? "L" : "l") +

                   "]");

     row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +

                   "[" + StrOf(ri.rword_box.width()) +

                   (ri.rword_likely_starts_idea ? "S" : "s") +

                   (ri.rword_likely_ends_idea ? "E" : "e") +

                   (ri.rword_indicates_list_item ? "L" : "l") +

                   "]");

     rows[i].AppendDebugInfo(theory, &row);

     row.push_back(RtlEmbed(ri.text, !ri.ltr));

   }

   PrintTable(output, " ");


   tprintf("Active Paragraph Models:\n");

   for (int m = 0; m < theory.models().size(); m++) {

     tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());

   }

 }


 // When should_print is true, print a "# <phase>" heading followed by the
 // detector state for rows under theory.  Does nothing otherwise.
 static void DebugDump(
     bool should_print,
     const STRING &phase,
     const ParagraphTheory &theory,
     const GenericVector<RowScratchRegisters> &rows) {
   if (should_print) {
     tprintf("# %s\n", phase.string());
     PrintDetectorState(theory, rows);
   }
 }


 // Print the text of each row in rows[row_start, row_end), one row per line,
 // framed above and below by a rule of '=' characters.
 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
                           int row_start, int row_end) {
   tprintf("======================================\n");
   int row = row_start;
   while (row < row_end) {
     tprintf("%s\n", rows[row].ri_->text.string());
     row++;
   }
   tprintf("======================================\n");
 }


 // ============= Brain Dead Language Model (ASCII Version) ===================


 // Return whether ch is one of the 52 unaccented ASCII letters a-z / A-Z.
 bool IsLatinLetter(int ch) {
   if (ch >= 'a' && ch <= 'z')
     return true;
   return ch >= 'A' && ch <= 'Z';
 }


 // Return whether ch is one of the letters 'o', 'O', 'l' or 'I', which this
 // file accepts in place of digits (see UnicodeSpanSkipper::SkipDigits).
 bool IsDigitLike(int ch) {
   switch (ch) {
     case 'o':
     case 'O':
     case 'l':
     case 'I':
       return true;
     default:
       return false;
   }
 }


 // Return whether ch is an ASCII character that can open a quotation or a
 // bracketed span: one of  ' " ( { [
 //
 // Only values in (0, 0x80) are looked up.  strchr() treats the terminating
 // NUL as part of the string (so 0 would otherwise match) and converts its
 // argument to char (so a larger code point such as 0x128 would otherwise
 // alias '(').
 bool IsOpeningPunct(int ch) {
   if (ch <= 0 || ch >= 0x80)
     return false;
   return strchr("'\"({[", ch) != NULL;
 }


 // Return whether ch is an ASCII character that can end a sentence, a
 // quotation or a bracketed span: one of  : ' " . ? ! ] } )
 //
 // Only values in (0, 0x80) are looked up.  strchr() treats the terminating
 // NUL as part of the string (so 0 would otherwise match) and converts its
 // argument to char (so a larger code point such as 0x13A would otherwise
 // alias ':').
 bool IsTerminalPunct(int ch) {
   if (ch <= 0 || ch >= 0x80)
     return false;
   return strchr(":'\".?!]})", ch) != NULL;
 }


 // Return a pointer just past the longest prefix of str made up solely of
 // characters that occur in toskip.  The result points at the first character
 // not in toskip, or at the terminating NUL of str.
 const char *SkipChars(const char *str, const char *toskip) {
   const char *cursor = str;
   for (; *cursor != '\0'; ++cursor) {
     if (strchr(toskip, *cursor) == NULL)
       break;
   }
   return cursor;
 }


 // Return a pointer just past the longest prefix of str whose characters all
 // satisfy the predicate skip.  The result points at the first character for
 // which skip returns false, or at the terminating NUL of str.
 const char *SkipChars(const char *str, bool (*skip)(int)) {
   const char *cursor = str;
   for (; *cursor != '\0'; ++cursor) {
     if (!skip(*cursor))
       break;
   }
   return cursor;
 }


 // If the first character of str occurs in toskip, return a pointer to the
 // character after it; otherwise (including when str is empty) return str.
 const char *SkipOne(const char *str, const char *toskip) {
   const bool at_end = (*str == '\0');
   if (at_end || strchr(toskip, *str) == NULL)
     return str;
   return str + 1;
 }


 // Return whether it is very likely that this is a numeral marker that could
 // start a list item.  Some examples include:
 //   A   I   iii.   VI   (2)   3.5.   [C-4]
 //
 // The word is read as at most three segments.  Each segment consists of up
 // to two opening brackets, a numeral, and then any closing brackets followed
 // by any separators.  A numeral is a run of roman-numeral letters, or a run
 // of decimal digits, or exactly one Latin letter.  The word qualifies only
 // if this parse consumes it entirely.
 bool LikelyListNumeral(const STRING &word) {
   const char *kRomans = "ivxlmdIVXLMD";
   // All ten decimal digits.  (This list previously lacked '6', so markers
   // such as "6." or "16)" were rejected.)
   const char *kDigits = "0123456789";
   const char *kOpen = "[{(";
   const char *kSep = ":;-.,";
   const char *kClose = "]})";

   int num_segments = 0;
   const char *pos = word.string();
   while (*pos != '\0' && num_segments < 3) {
     // skip up to two open parens.
     const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
     const char *numeral_end = SkipChars(numeral_start, kRomans);
     if (numeral_end != numeral_start) {
       // Got Roman Numeral. Great.
     } else {
       numeral_end = SkipChars(numeral_start, kDigits);
       if (numeral_end == numeral_start) {
         // If there's a single latin letter, we can use that.
         numeral_end = SkipChars(numeral_start, IsLatinLetter);
         if (numeral_end - numeral_start != 1)
           break;
       }
     }
     // We got some sort of numeral.
     num_segments++;
     // Skip any trailing parens or punctuation.
     pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
     // Without trailing punctuation another segment cannot follow; the word
     // then qualifies only if the numeral ended it.
     if (pos == numeral_end)
       break;
   }
   return *pos == '\0';
 }


 // Return whether word is exactly one character long and that character is
 // one of the ASCII bullet-like marks listed in kListMarks.
 bool LikelyListMark(const STRING &word) {
   const char *kListMarks = "0Oo*.,+.";
   if (word.size() != 1)
     return false;
   return strchr(kListMarks, word[0]) != NULL;
 }


 // Return whether word looks like a list item marker: either a single bullet
 // mark (LikelyListMark) or a list numeral (LikelyListNumeral).
 bool AsciiLikelyListItem(const STRING &word) {
   if (LikelyListMark(word))
     return true;
   return LikelyListNumeral(word);
 }


 // ========== Brain Dead Language Model (Tesseract Version) ================


 // Return the first Unicode Codepoint from werd[pos].
 //
 // Returns 0 when u or werd is NULL, or when pos is not a valid index into
 // werd, i.e. is outside [0, werd->length()).  (The upper bound used to be
 // inclusive, which let pos == werd->length() through to unichar_id().)
 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
   if (!u || !werd || pos < 0 || pos >= werd->length())
     return 0;
   return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
 }


 // A useful helper class for finding the first j >= i so that word[j]
 // does not have given character type.
 //
 // The unicharset and word pointers are stored, not copied, and the word's
 // length is read once at construction; word must therefore be non-NULL.
 class UnicodeSpanSkipper {
  public:
   UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
       : u_(unicharset), word_(word), wordlen_(word->length()) {}

   // Each Skip* method takes a starting position and returns the first
   // position >= pos whose character is not of the named kind (or the word
   // length if every remaining character is of that kind).

   // Skip characters the unicharset classifies as punctuation.
   int SkipPunc(int pos);
   // Skip digits (including digit look-alike letters).
   int SkipDigits(int pos);
   // Skip roman numeral letters.
   int SkipRomans(int pos);
   // Skip characters the unicharset classifies as alphabetic.
   int SkipAlpha(int pos);

  private:
   const UNICHARSET *u_;
   const WERD_CHOICE *word_;
   int wordlen_;
 };


 // Return the first position >= pos whose character the unicharset does not
 // classify as punctuation, or the word length if there is none.
 int UnicodeSpanSkipper::SkipPunc(int pos) {
   for (; pos < wordlen_; pos++) {
     if (!u_->get_ispunctuation(word_->unichar_id(pos)))
       break;
   }
   return pos;
 }


 // Return the first position >= pos whose character is neither a digit
 // according to the unicharset nor a digit look-alike (IsDigitLike), or the
 // word length if there is none.
 int UnicodeSpanSkipper::SkipDigits(int pos) {
   for (; pos < wordlen_; pos++) {
     const bool is_digit = u_->get_isdigit(word_->unichar_id(pos));
     if (!is_digit && !IsDigitLike(UnicodeFor(u_, word_, pos)))
       break;
   }
   return pos;
 }


 // Return the first position >= pos whose character is not one of the roman
 // numeral letters in kRomans, or the word length if there is none.
 int UnicodeSpanSkipper::SkipRomans(int pos) {
   const char *kRomans = "ivxlmdIVXLMD";
   while (pos < wordlen_) {
     int ch = UnicodeFor(u_, word_, pos);
     // Only look up genuine ASCII code points.  strchr() treats the
     // terminating NUL as part of the string, so a code point of 0 (which
     // UnicodeFor can return) would otherwise be accepted as a roman numeral;
     // values of 0x80 and above can never match the all-ASCII kRomans.
     if (ch <= 0 || ch >= 0x80 || strchr(kRomans, ch) == NULL) break;
     pos++;
   }
   return pos;
 }


 // Return the first position >= pos whose character the unicharset does not
 // classify as alphabetic, or the word length if there is none.
 int UnicodeSpanSkipper::SkipAlpha(int pos) {
   for (; pos < wordlen_; pos++) {
     if (!u_->get_isalpha(word_->unichar_id(pos)))
       break;
   }
   return pos;
 }


 bool LikelyListMarkUnicode(int ch) {

   if (ch < 0x80) {

     STRING single_ch;

     single_ch += ch;

     return LikelyListMark(single_ch);

   }

   switch (ch) {

     // TODO(eger) expand this list of unicodes as needed.

     case 0x00B0:  // degree sign

     case 0x2022:  // bullet

     case 0x25E6:  // white bullet

     case 0x00B7:  // middle dot

     case 0x25A1:  // white square

     case 0x25A0:  // black square

     case 0x25AA:  // black small square

     case 0x2B1D:  // black very small square

     case 0x25BA:  // black right-pointing pointer

     case 0x25CF:  // black circle

     case 0x25CB:  // white circle

       return true;

     default:

       break;  // fall through

   }

   return false;

 }


 // Return whether it is very likely that this is a numeral marker that could

 // start a list item.  Some examples include:

 //   A   I   iii.   VI   (2)   3.5.   [C-4]

 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {

   if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))

     return true;


   UnicodeSpanSkipper m(u, werd);

   int num_segments = 0;

   int pos = 0;

   while (pos < werd->length() && num_segments < 3) {

     int numeral_start = m.SkipPunc(pos);

     if (numeral_start > pos + 1) break;

     int numeral_end = m.SkipRomans(numeral_start);

     if (numeral_end == numeral_start) {

       numeral_end = m.SkipDigits(numeral_start);

       if (numeral_end == numeral_start) {

         // If there's a single latin letter, we can use that.

         numeral_end = m.SkipAlpha(numeral_start);

         if (numeral_end - numeral_start != 1)

           break;

       }

     }

     // We got some sort of numeral.

     num_segments++;

     // Skip any trailing punctuation.

     pos = m.SkipPunc(numeral_end);

     if (pos == numeral_end)

       break;

   }

   return pos == werd->length();

 }


 // ========= Brain Dead Language Model (combined entry points) ================


 // Given the leftmost word of a line either as a Tesseract unicharset + werd

 // or a utf8 string, set the following attributes for it:

 //   is_list -      this word might be a list number or bullet.

 //   starts_idea -  this word is likely to start a sentence.

 //   ends_idea -    this word is likely to end a sentence.

 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,

                         const STRING &utf8,

                         bool *is_list, bool *starts_idea, bool *ends_idea) {

   *is_list = false;

   *starts_idea = false;

   *ends_idea = false;

   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty

     *ends_idea = true;

     return;

   }


   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.

     if (UniLikelyListItem(unicharset, werd)) {

       *is_list = true;

       *starts_idea = true;

       *ends_idea = true;

     }

     if (unicharset->get_isupper(werd->unichar_id(0))) {

       *starts_idea = true;

     }

     if (unicharset->get_ispunctuation(werd->unichar_id(0))) {

       *starts_idea = true;

       *ends_idea = true;

     }

   } else {  // Assume utf8 is mostly ASCII

     if (AsciiLikelyListItem(utf8)) {

       *is_list = true;

       *starts_idea = true;

     }

     int start_letter = utf8[0];

     if (IsOpeningPunct(start_letter)) {

       *starts_idea = true;

     }

     if (IsTerminalPunct(start_letter)) {

       *ends_idea = true;

     }

     if (start_letter >= 'A' && start_letter <= 'Z') {

       *starts_idea = true;

     }

   }

 }


 // Given the rightmost word of a line either as a Tesseract unicharset + werd

 // or a utf8 string, set the following attributes for it:

 //   is_list -      this word might be a list number or bullet.

 //   starts_idea -  this word is likely to start a sentence.

 //   ends_idea -    this word is likely to end a sentence.

 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,

                          const STRING &utf8,

                          bool *is_list, bool *starts_idea, bool *ends_idea) {

   *is_list = false;

   *starts_idea = false;

   *ends_idea = false;

   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty

     *ends_idea = true;

     return;

   }


   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.

     if (UniLikelyListItem(unicharset, werd)) {

       *is_list = true;

       *starts_idea = true;

     }

     UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);

     if (unicharset->get_ispunctuation(last_letter)) {

       *ends_idea = true;

     }

   } else {  // Assume utf8 is mostly ASCII

     if (AsciiLikelyListItem(utf8)) {

       *is_list = true;

       *starts_idea = true;

     }

     int last_letter = utf8[utf8.size() - 1];

     if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {

       *ends_idea = true;

     }

   }

 }


 // =============== Implementation of RowScratchRegisters =====================

 /* static */
 // Append to header the titles of the two columns that AppendDebugInfo
 // produces, in the same order.
 void RowScratchRegisters::AppendDebugHeaderFields(
     GenericVector<STRING> *header) {
   static const char *const kColumnTitles[] = {
     "[lmarg,lind;rind,rmarg]",
     "model",
   };
   const int num_titles = sizeof(kColumnTitles) / sizeof(kColumnTitles[0]);
   for (int i = 0; i < num_titles; i++) {
     header->push_back(kColumnTitles[i]);
   }
 }


 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,

                                           GenericVector<STRING> *dbg) const {

   char s[30];

   snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",

            lmargin_, lindent_, rindent_, rmargin_);

   dbg->push_back(s);

   STRING model_string;

   model_string += static_cast<char>(GetLineType());

   model_string += ":";


   int model_numbers = 0;

   for (int h = 0; h < hypotheses_.size(); h++) {

     if (hypotheses_[h].model == NULL)

       continue;

     if (model_numbers > 0)

       model_string += ",";

     if (StrongModel(hypotheses_[h].model)) {

       model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));

     } else if (hypotheses_[h].model == kCrownLeft) {

       model_string += "CrL";

     } else if (hypotheses_[h].model == kCrownRight) {

       model_string += "CrR";

     }

     model_numbers++;

   }

   if (model_numbers == 0)

     model_string += "0";


   dbg->push_back(model_string);

 }


 // Attach this scratch record to row (the address of row is kept in ri_),
 // zero both margins and take the indents from the row's pixel distances.
 void RowScratchRegisters::Init(const RowInfo &row) {
   ri_ = &row;
   lmargin_ = rmargin_ = 0;
   lindent_ = row.pix_ldistance;
   rindent_ = row.pix_rdistance;
 }


 // Summarise the row's hypotheses as a single line type:
 //   LT_UNKNOWN  - there are no hypotheses,
 //   LT_MULTIPLE - both start and body hypotheses are present,
 //   LT_START    - at least one start hypothesis and no body hypothesis,
 //   LT_BODY     - otherwise.
 // A hypothesis of any other type is reported via tprintf and ignored.
 LineType RowScratchRegisters::GetLineType() const {
   if (hypotheses_.empty())
     return LT_UNKNOWN;

   bool has_start = false;
   bool has_body = false;
   for (int i = 0; i < hypotheses_.size(); i++) {
     const LineType ty = hypotheses_[i].ty;
     if (ty == LT_START) {
       has_start = true;
     } else if (ty == LT_BODY) {
       has_body = true;
     } else {
       tprintf("Encountered bad value in hypothesis list: %c\n", ty);
     }
   }

   if (has_start)
     return has_body ? LT_MULTIPLE : LT_START;
   return LT_BODY;
 }


 // Like GetLineType(), but only hypotheses whose model equals model are
 // considered:
 //   LT_UNKNOWN  - the row has no hypotheses at all,
 //   LT_MULTIPLE - model has both start and body hypotheses,
 //   LT_START    - model has a start hypothesis and no body hypothesis,
 //   LT_BODY     - otherwise (note: this includes the case where no
 //                 hypothesis refers to model).
 // A matching hypothesis of any other type is reported via tprintf and
 // ignored.
 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
   if (hypotheses_.empty())
     return LT_UNKNOWN;

   bool has_start = false;
   bool has_body = false;
   for (int i = 0; i < hypotheses_.size(); i++) {
     if (hypotheses_[i].model != model)
       continue;
     const LineType ty = hypotheses_[i].ty;
     if (ty == LT_START) {
       has_start = true;
     } else if (ty == LT_BODY) {
       has_body = true;
     } else {
       tprintf("Encountered bad value in hypothesis list: %c\n", ty);
     }
   }

   if (has_start)
     return has_body ? LT_MULTIPLE : LT_START;
   return LT_BODY;
 }


 void RowScratchRegisters::SetStartLine() {

   LineType current_lt = GetLineType();

   if (current_lt != LT_UNKNOWN && current_lt != LT_START) {

     tprintf("Trying to set a line to be START when it's already BODY.\n");

   }

   if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {

     hypotheses_.push_back_new(LineHypothesis(LT_START, NULL));

   }

 }


 void RowScratchRegisters::SetBodyLine() {

   LineType current_lt = GetLineType();

   if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {

     tprintf("Trying to set a line to be BODY when it's already START.\n");

   }

   if (current_lt == LT_UNKNOWN || current_lt == LT_START) {

     hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL));

   }

 }


 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {

   hypotheses_.push_back_new(LineHypothesis(LT_START, model));

   int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL));

   if (old_idx >= 0)

     hypotheses_.remove(old_idx);

 }


 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {

   hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));

   int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL));

   if (old_idx >= 0)

     hypotheses_.remove(old_idx);

 }


 // Add to models (via push_back_new) every strong model for which this row
 // holds a START hypothesis.
 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
   for (int h = 0; h < hypotheses_.size(); h++) {
     const LineHypothesis &hyp = hypotheses_[h];
     if (hyp.ty != LT_START)
       continue;
     if (StrongModel(hyp.model))
       models->push_back_new(hyp.model);
   }
 }


 // Add to models (via push_back_new) every strong model that appears in any
 // of this row's hypotheses, whatever the hypothesis type.
 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
   for (int h = 0; h < hypotheses_.size(); h++) {
     const ParagraphModel *candidate = hypotheses_[h].model;
     if (!StrongModel(candidate))
       continue;
     models->push_back_new(candidate);
   }
 }


 // Add to models (via push_back_new) every non-NULL model that appears in
 // any of this row's hypotheses.
 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
   for (int h = 0; h < hypotheses_.size(); h++) {
     const ParagraphModel *candidate = hypotheses_[h].model;
     if (candidate == NULL)
       continue;
     models->push_back_new(candidate);
   }
 }


 // If this row has exactly one hypothesis and it is a START hypothesis,
 // return its model (which may itself be NULL); otherwise return NULL.
 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
   if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_START)
     return hypotheses_[0].model;
   return NULL;
 }


 // If this row has exactly one hypothesis and it is a BODY hypothesis,
 // return its model (which may itself be NULL); otherwise return NULL.
 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
   if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_BODY)
     return hypotheses_[0].model;
   return NULL;
 }


 // Discard any hypotheses whose model is not in the given list.
 // An empty list leaves the hypotheses untouched.
 void RowScratchRegisters::DiscardNonMatchingHypotheses(
     const SetOfModels &models) {
   if (models.empty())
     return;
   // Walk backwards so that removing an entry does not disturb the indices
   // still to be visited.
   int h = hypotheses_.size();
   while (--h >= 0) {
     if (models.contains(hypotheses_[h].model))
       continue;
     hypotheses_.remove(h);
   }
 }


 // ============ Geometry based Paragraph Detection Algorithm =================


 // A group of nearby integer values, summarised by its center and the number
 // of values it contains.
 struct Cluster {
   // An empty cluster centered at 0.
   Cluster()
       : center(0),
         count(0) {}

   // A cluster of num entries centered at cen.
   Cluster(int cen, int num)
       : center(cen),
         count(num) {}

   int center;  // The center of the cluster.
   int count;   // The number of entries within the cluster.
 };


 // Collects integer values and groups them into clusters no wider than a
 // fixed maximum width.
 class SimpleClusterer {
  public:
   // max_cluster_width is the largest allowed difference between the lowest
   // value of a cluster and any other value in it.
   explicit SimpleClusterer(int max_cluster_width)
       : max_cluster_width_(max_cluster_width) {}

   // Record one more value to be clustered.
   void Add(int value) {
     values_.push_back(value);
   }

   // The number of values recorded so far.
   int size() const {
     return values_.size();
   }

   // Replace the contents of *clusters with the clusters of the recorded
   // values.  Sorts the recorded values as a side effect.
   void GetClusters(GenericVector<Cluster> *clusters);

  private:
   int max_cluster_width_;
   GenericVectorEqEq<int> values_;
 };


 // Return the index of the cluster closest to value.
 // Ties go to the lowest index; 0 is returned when clusters is empty.
 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
   int best_index = 0;
   // Index 0 is the initial candidate, so the scan can start at 1.
   for (int i = 1; i < clusters.size(); i++) {
     const int best_dist = abs(value - clusters[best_index].center);
     const int dist = abs(value - clusters[i].center);
     if (dist < best_dist)
       best_index = i;
   }
   return best_index;
 }


 // Replace the contents of *clusters with the clusters of the recorded
 // values.  The values are sorted, then swept in ascending order: a cluster
 // starts at the lowest unassigned value lo and absorbs every following value
 // that is <= lo + max_cluster_width_.  Each cluster's center is the midpoint
 // of its lowest and highest value.
 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
   clusters->clear();
   values_.sort();
   int i = 0;
   while (i < values_.size()) {
     const int first = i;
     const int lo = values_[first];
     int hi = lo;
     for (++i; i < values_.size() && values_[i] <= lo + max_cluster_width_;
          ++i) {
       hi = values_[i];
     }
     clusters->push_back(Cluster((hi + lo) / 2, i - first));
   }
 }


 // Calculate left- and right-indent tab stop values seen in
 // rows[row_start, row_end) given a tolerance of tolerance.
 //
 // The results replace the contents of *left_tabs and *right_tabs.  Nothing
 // is written if the row range is invalid or empty.
 void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
                        int row_start, int row_end,
                        int tolerance,
                        GenericVector<Cluster> *left_tabs,
                        GenericVector<Cluster> *right_tabs) {
   if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
     return;

   // First pass: toss all left and right indents into clusterers.
   SimpleClusterer all_lefts(tolerance);
   SimpleClusterer all_rights(tolerance);
   for (int r = row_start; r < row_end; r++) {
     const RowScratchRegisters &row = (*rows)[r];
     all_lefts.Add(row.lindent_);
     all_rights.Add(row.rindent_);
   }
   GenericVector<Cluster> rough_left_tabs;
   GenericVector<Cluster> rough_right_tabs;
   all_lefts.GetClusters(&rough_left_tabs);
   all_rights.GetClusters(&rough_right_tabs);

   // Outlier elimination.  We might want to switch this to test outlier-ness
   // based on how strange a position an outlier is in instead of or in
   // addition to how rare it is.  These outliers get re-added if we end up
   // having too few tab stops to work with, however.
   const int num_rows = row_end - row_start;
   int rare_count = 0;
   if (num_rows >= 20) {
     rare_count = 2;
   } else if (num_rows >= 8) {
     rare_count = 1;
   }

   // Second pass: cluster only lines that are not "stray".
   //   An example of a stray line is a page number -- a line whose start
   //   and end tab-stops are far outside the typical start and end tab-stops
   //   for the block.
   //   Put another way, we only cluster data from lines whose start or end
   //   tab stop is frequent.
   SimpleClusterer lefts(tolerance);
   SimpleClusterer rights(tolerance);
   for (int r = row_start; r < row_end; r++) {
     const RowScratchRegisters &row = (*rows)[r];
     const int lidx = ClosestCluster(rough_left_tabs, row.lindent_);
     const int ridx = ClosestCluster(rough_right_tabs, row.rindent_);
     const bool frequent = rough_left_tabs[lidx].count > rare_count ||
                           rough_right_tabs[ridx].count > rare_count;
     if (frequent) {
       lefts.Add(row.lindent_);
       rights.Add(row.rindent_);
     }
   }
   lefts.GetClusters(left_tabs);
   rights.GetClusters(right_tabs);

   const bool one_left_ragged_right =
       left_tabs->size() == 1 && right_tabs->size() >= 4;
   const bool one_right_ragged_left =
       right_tabs->size() == 1 && left_tabs->size() >= 4;
   if (one_left_ragged_right || one_right_ragged_left) {
     // One side is really ragged, and the other only has one tab stop,
     // so those "insignificant outliers" are probably important, actually.
     // This often happens on a page of an index.  Add back in the ones
     // we omitted in the first pass.
     for (int r = row_start; r < row_end; r++) {
       const RowScratchRegisters &row = (*rows)[r];
       const int lidx = ClosestCluster(rough_left_tabs, row.lindent_);
       const int ridx = ClosestCluster(rough_right_tabs, row.rindent_);
       const bool infrequent = rough_left_tabs[lidx].count <= rare_count &&
                               rough_right_tabs[ridx].count <= rare_count;
       if (infrequent) {
         lefts.Add(row.lindent_);
         rights.Add(row.rindent_);
       }
     }
   }
   lefts.GetClusters(left_tabs);
   rights.GetClusters(right_tabs);

   // If one side is almost a two-indent aligned side, and the other clearly
   // isn't, try to prune out the least frequent tab stop from that side.
   // Among equally infrequent tab stops the one with the highest index is
   // chosen.
   if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
     int rarest = left_tabs->size() - 1;
     for (int i = rarest - 1; i >= 0; i--) {
       if ((*left_tabs)[i].count < (*left_tabs)[rarest].count)
         rarest = i;
     }
     if ((*left_tabs)[rarest].count <= rare_count) {
       left_tabs->remove(rarest);
     }
   }
   if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
     int rarest = right_tabs->size() - 1;
     for (int i = rarest - 1; i >= 0; i--) {
       if ((*right_tabs)[i].count < (*right_tabs)[rarest].count)
         rarest = i;
     }
     if ((*right_tabs)[rarest].count <= rare_count) {
       right_tabs->remove(rarest);
     }
   }
 }


 // Given a paragraph model mark rows[row_start, row_end) as said model
 // start or body lines.
 //
 // Case 1: model->first_indent_ != model->body_indent_
 //   Differentiating the paragraph start lines from the paragraph body lines in
 //   this case is easy, we just see how far each line is indented.
 //
 // Case 2: model->first_indent_ == model->body_indent_
 //   Here, we find end-of-paragraph lines by looking for "short lines."
 //   What constitutes a "short line" changes depending on whether the text is
 //   ragged-right[left] or fully justified (aligned left and right).
 //
 //   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
 //     We have a new paragraph if the first word would have fit at the end
 //     of the previous line.
 //
 //   Case 2b: Fully Justified.  (eop_threshold > 0)
 //     We mark a line as short (end of paragraph) if the offside indent
 //     is greater than eop_threshold.
 //
 // Rows that are valid neither as a first line nor as a body line of model
 // are left unmarked.  The ltr argument is not consulted here.
 void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
                        int row_start, int row_end,
                        const ParagraphModel *model,
                        bool ltr,
                        int eop_threshold) {
   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
     return;
   for (int row = row_start; row < row_end; row++) {
     const bool fits_first = ValidFirstLine(rows, row, model);
     const bool fits_body = ValidBodyLine(rows, row, model);
     if (!fits_first && !fits_body)
       continue;  // Do nothing. Stray row.

     bool is_start;
     if (fits_first != fits_body) {
       // Only one reading is possible (case 1).
       is_start = fits_first;
     } else if (row == row_start) {
       // Ambiguous, but the first row of the range always starts a paragraph.
       is_start = true;
     } else if (eop_threshold > 0) {
       // Case 2b: did the previous row end short on the offside?
       const RowScratchRegisters &prev = (*rows)[row - 1];
       if (model->justification() == JUSTIFICATION_LEFT) {
         is_start = prev.rindent_ > eop_threshold;
       } else {
         is_start = prev.lindent_ > eop_threshold;
       }
     } else {
       // Case 2a.
       is_start = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
                                        model->justification());
     }

     if (is_start) {
       (*rows)[row].AddStartLine(model);
     } else {
       (*rows)[row].AddBodyLine(model);
     }
   }
 }


 // GeometricClassifierState holds all of the information we'll use while

 // trying to determine a paragraph model for the text lines in a block of

 // text:

 //   + the rows under consideration [row_start, row_end)

 //   + the common left- and right-indent tab stops

 //   + does the block start out left-to-right or right-to-left

 // Further, this struct holds the data we amass for the (single) ParagraphModel

 // we'll assign to the text lines (assuming we get that far).

 struct GeometricClassifierState {

   GeometricClassifierState(int dbg_level,

                            GenericVector<RowScratchRegisters> *r,

                            int r_start, int r_end)

       : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),

         margin(0) {

     tolerance = InterwordSpace(*r, r_start, r_end);

     CalculateTabStops(r, r_start, r_end, tolerance,

                       &left_tabs, &right_tabs);

     if (debug_level >= 3) {

       tprintf("Geometry: TabStop cluster tolerance = %d; "

               "%d left tabs; %d right tabs\n",

               tolerance, left_tabs.size(), right_tabs.size());

     }

     ltr = (*r)[r_start].ri_->ltr;

   }


   void AssumeLeftJustification() {

     just = tesseract::JUSTIFICATION_LEFT;

     margin = (*rows)[row_start].lmargin_;

   }


   void AssumeRightJustification() {

     just = tesseract::JUSTIFICATION_RIGHT;

     margin = (*rows)[row_start].rmargin_;

   }


   // Align tabs are the tab stops the text is aligned to.

   const GenericVector<Cluster> &AlignTabs() const {

     if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;

     return left_tabs;

   }


   // Offside tabs are the tab stops opposite the tabs used to align the text.

   //

   // Note that for a left-to-right text which is aligned to the right such as

   //     this function comment, the offside tabs are the horizontal tab stops

   //                 marking the beginning of ("Note", "this" and "marking").

   const GenericVector<Cluster> &OffsideTabs() const {

     if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;

     return right_tabs;

   }


   // Return whether the i'th row extends from the leftmost left tab stop

   // to the right most right tab stop.

   bool IsFullRow(int i) const {

     return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&

         ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;

   }


   int AlignsideTabIndex(int row_idx) const {

     return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));

   }


   // Given what we know about the paragraph justification (just), would the

   // first word of row_b have fit at the end of row_a?

   bool FirstWordWouldHaveFit(int row_a, int row_b) {

     return ::tesseract::FirstWordWouldHaveFit(

         (*rows)[row_a], (*rows)[row_b], just);

   }


   void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }


   void Fail(int min_debug_level, const char *why) const {

     if (debug_level < min_debug_level) return;

     tprintf("# %s\n", why);

     PrintRows();

   }


   ParagraphModel Model() const {

     return ParagraphModel(just, margin, first_indent, body_indent, tolerance);

   }


   // We print out messages with a debug level at least as great as debug_level.

   int debug_level;


   // The Geometric Classifier was asked to find a single paragraph model

   // to fit the text rows (*rows)[row_start, row_end)

   GenericVector<RowScratchRegisters> *rows;

   int row_start;

   int row_end;


   // The amount by which we expect the text edge can vary and still be aligned.

   int tolerance;


   // Is the script in this text block left-to-right?

   // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve

   bool ltr;


   // These left and right tab stops were determined to be the common tab

   // stops for the given text.

   GenericVector<Cluster> left_tabs;

   GenericVector<Cluster> right_tabs;


   // These are parameters we must determine to create a ParagraphModel.

   tesseract::ParagraphJustification just;

   int margin;

   int first_indent;

   int body_indent;


   // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()

   int eop_threshold;

 };


 // Given a section of text where strong textual clues did not help identifying

 // paragraph breaks, and for which the left and right indents have exactly

 // three tab stops between them, attempt to find the paragraph breaks based

 // solely on the outline of the text and whether the script is left-to-right.

 //

 // Algorithm Detail:

 //   The selected rows are in the form of a rectangle except

 //   for some number of "short lines" of the same length:

 //

 //   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx

 //           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.

 //         xxxxxxxxxxxxx       xxxxxxxxxxxx

 //         xxxxxxxxxxxxx       xxxxxxxxxxxx

 //

 //   We have a slightly different situation if the only short

 //   line is at the end of the excerpt.

 //

 //   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx

 //        xxxxxxxxxxxxx       xxxxxxxxxxxx

 //        xxxxxxxxxxxxx       xxxxxxxxxxxx

 //          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.

 //

 //   We'll interpret these as follows based on the reasoning in the comment for

 //   GeometricClassify():

 //       [script direction: first indent, body indent]

 //   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0

 //   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0

 void GeometricClassifyThreeTabStopTextBlock(

     int debug_level,

     GeometricClassifierState &s,

     ParagraphTheory *theory) {

   int num_rows = s.row_end - s.row_start;

   int num_full_rows = 0;

   int last_row_full = 0;

   for (int i = s.row_start; i < s.row_end; i++) {

     if (s.IsFullRow(i)) {

       num_full_rows++;

       if (i == s.row_end - 1) last_row_full++;

     }

   }


   if (num_full_rows < 0.7 * num_rows) {

     s.Fail(1, "Not enough full lines to know which lines start paras.");

     return;

   }


   // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()

   s.eop_threshold = 0;


   if (s.ltr) {

     s.AssumeLeftJustification();

   } else {

     s.AssumeRightJustification();

   }


   if (debug_level > 0) {

     tprintf("# Not enough variety for clear outline classification. "

             "Guessing these are %s aligned based on script.\n",

             s.ltr ? "left" : "right");

     s.PrintRows();

   }


   if (s.AlignTabs().size() == 2) {  // case A1 or A2

     s.first_indent = s.AlignTabs()[1].center;

     s.body_indent = s.AlignTabs()[0].center;

   } else {                      // case B1 or B2

     if (num_rows - 1 == num_full_rows - last_row_full) {

       // case B2

       const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;

       (*s.rows)[s.row_start].AddStartLine(model);

       for (int i = s.row_start + 1; i < s.row_end; i++) {

         (*s.rows)[i].AddBodyLine(model);

       }

       return;

     } else {

       // case B1

       s.first_indent = s.body_indent = s.AlignTabs()[0].center;

       s.eop_threshold = (s.OffsideTabs()[0].center +

                          s.OffsideTabs()[1].center) / 2;

     }

   }

   const ParagraphModel *model = theory->AddModel(s.Model());

   MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,

                     s.ltr, s.eop_threshold);

   return;

 }


 // This function is called if strong textual clues were not available, but

 // the caller hopes that the paragraph breaks will be super obvious just

 // by the outline of the text.

 //

 // The particularly difficult case is figuring out what's going on if you

 // don't have enough short paragraph end lines to tell us what's going on.

 //

 // For instance, let's say you have the following outline:

 //

 //   (A1)  xxxxxxxxxxxxxxxxxxxxxx

 //           xxxxxxxxxxxxxxxxxxxx

 //         xxxxxxxxxxxxxxxxxxxxxx

 //         xxxxxxxxxxxxxxxxxxxxxx

 //

 // Even if we know that the text is left-to-right and so will probably be

 // left-aligned, both of the following are possible texts:

 //

 //  (A1a)  1. Here our list item

 //           with two full lines.

 //         2. Here a second item.

 //         3. Here our third one.

 //

 //  (A1b)  so ends paragraph one.

 //           Here  starts another

 //         paragraph  we want  to

 //         read.  This  continues

 //

 // These examples are obvious from the text and should have been caught

 // by the StrongEvidenceClassify pass.  However, for languages where we don't

 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),

 // it's worth guessing that (A1b) is the correct interpretation if there are

 // far more "full" lines than "short" lines.

 void GeometricClassify(int debug_level,
                        GenericVector<RowScratchRegisters> *rows,
                        int row_start, int row_end,
                        ParagraphTheory *theory) {
   if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
     return;
   if (debug_level > 1) {
     tprintf("###############################################\n");
     tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n",
             row_start, row_end);
     tprintf("###############################################\n");
   }
   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

   GeometricClassifierState s(debug_level, rows, row_start, row_end);
   const int num_left = s.left_tabs.size();
   const int num_right = s.right_tabs.size();
   if (num_left > 2 && num_right > 2) {
     s.Fail(2, "Too much variety for simple outline classification.");
     return;
   }
   if (num_left <= 1 && num_right <= 1) {
     s.Fail(1, "Not enough variety for simple outline classification.");
     return;
   }
   if (num_left + num_right == 3) {
     GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
     return;
   }

   // At this point, we know that one side has at least two tab stops, and the
   // other side has one or two tab stops.
   // Left to determine:
   //   (1) Which is the body indent and which is the first line indent?
   //   (2) Is the text fully justified?

   // A side with three or more tab stops is taken to be the ragged (offside)
   // edge; otherwise fall back on the script direction.
   bool align_left;
   if (num_right > 2) {
     align_left = true;
   } else if (num_left > 2) {
     align_left = false;
   } else {
     align_left = s.ltr;
   }
   if (align_left)
     s.AssumeLeftJustification();
   else
     s.AssumeRightJustification();

   if (s.AlignTabs().size() == 2) {
     // firsts[t]: number of rows at align-side tab stop t that look like
     // paragraph start lines.
     int firsts[2] = {0, 0};
     // The first row counts as a likely paragraph start.
     firsts[s.AlignsideTabIndex(s.row_start)]++;
     // So does any row whose first word would have fit on the previous row.
     bool jam_packed = true;
     for (int i = s.row_start + 1; i < s.row_end; i++) {
       if (!s.FirstWordWouldHaveFit(i - 1, i)) continue;
       firsts[s.AlignsideTabIndex(i)]++;
       jam_packed = false;
     }
     // Make an extra accounting for the last line of the paragraph just
     // in case it's the only short line in the block.  That is, take its
     // first word as typical and see if this looks like the *last* line
     // of a paragraph.  If so, mark the *other* indent as probably a first.
     const int last_row = s.row_end - 1;
     if (jam_packed && s.FirstWordWouldHaveFit(last_row, last_row)) {
       firsts[1 - s.AlignsideTabIndex(last_row)]++;
     }

     const int percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
     const int percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

     // TODO(eger): Tune these constants if necessary.
     const bool tab1_starts_paras =
         (percent0firsts < 20 && 30 < percent1firsts) ||
         percent0firsts + 30 < percent1firsts;
     const bool tab0_starts_paras =
         (percent1firsts < 20 && 30 < percent0firsts) ||
         percent1firsts + 30 < percent0firsts;
     if (tab1_starts_paras) {
       s.first_indent = s.AlignTabs()[1].center;
       s.body_indent = s.AlignTabs()[0].center;
     } else if (tab0_starts_paras) {
       s.first_indent = s.AlignTabs()[0].center;
       s.body_indent = s.AlignTabs()[1].center;
     } else {
       // Ambiguous! Probably lineated (poetry)
       if (debug_level > 1) {
         tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                 s.AlignTabs()[0].center, percent0firsts);
         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                 s.AlignTabs()[1].center, percent1firsts);
         s.PrintRows();
       }
       return;
     }
   } else {
     // There's only one tab stop for the "aligned to" side.
     s.first_indent = s.body_indent = s.AlignTabs()[0].center;
   }

   // At this point, we have our model.
   const ParagraphModel *model = theory->AddModel(s.Model());

   // Now all we have to do is figure out if the text is fully justified or not.
   // eop_threshold: default to fully justified unless we see evidence below.
   //    See description on MarkRowsWithModel()
   s.eop_threshold =
       (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
   // If the text is not fully justified, re-set the eop_threshold to 0.
   bool fully_justified = true;
   if (s.AlignTabs().size() == 2) {
     // Paragraphs with a paragraph-start indent.
     // NOTE(review): the message below speaks of a "non-end-of-paragraph"
     // short line, yet this tests rows whose *successor* is a valid first
     // line -- confirm whether the ValidFirstLine() test should be negated.
     for (int i = s.row_start; fully_justified && i < s.row_end - 1; i++) {
       if (ValidFirstLine(s.rows, i + 1, model) &&
           !NearlyEqual(s.OffsideTabs()[0].center,
                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
         // We found a non-end-of-paragraph short line: not fully justified.
         fully_justified = false;
       }
     }
   } else {
     // Paragraphs with no paragraph-start indent.
     for (int i = s.row_start; fully_justified && i < s.row_end - 1; i++) {
       if (!s.FirstWordWouldHaveFit(i, i + 1) &&
           !NearlyEqual(s.OffsideTabs()[0].center,
                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
         // We found a non-end-of-paragraph short line: not fully justified.
         fully_justified = false;
       }
     }
   }
   if (!fully_justified) s.eop_threshold = 0;

   MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
 }


 // =============== Implementation of ParagraphTheory =====================


 // Return a model of this theory for which Comparable(model) holds.  If there
 // is none yet, a heap copy of `model` is appended to *models_, recorded in
 // models_we_added_, and returned.
 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
   for (int i = 0; i < models_->size(); i++) {
     ParagraphModel *existing = (*models_)[i];
     if (existing->Comparable(model)) return existing;
   }
   ParagraphModel *copy = new ParagraphModel(model);
   models_->push_back(copy);
   models_we_added_.push_back_new(copy);
   return copy;
 }


 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {

   for (int i = models_->size() - 1; i >= 0; i--) {

     ParagraphModel *m = (*models_)[i];

     if (!used_models.contains(m) && models_we_added_.contains(m)) {

       models_->remove(i);

       models_we_added_.remove(models_we_added_.get_index(m));

       delete m;

     }

   }

 }


 // Examine rows[start, end) and try to determine if an existing non-centered

 // paragraph model would fit them perfectly.  If so, return a pointer to it.

 // If not, return NULL.

 const ParagraphModel *ParagraphTheory::Fits(
     const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
   for (int m = 0; m < models_->size(); m++) {
     const ParagraphModel *candidate = (*models_)[m];
     // Centered models are never offered as a fit.
     if (candidate->justification() == JUSTIFICATION_CENTER) continue;
     // The first model (in theory order) that fits wins.
     if (RowsFitModel(rows, start, end, candidate)) return candidate;
   }
   return NULL;
 }


 // Add every model of this theory whose justification is not
 // JUSTIFICATION_CENTER to *models (via push_back_new).
 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
   for (int m = 0; m < models_->size(); m++) {
     const ParagraphModel *candidate = (*models_)[m];
     if (candidate->justification() == JUSTIFICATION_CENTER) continue;
     models->push_back_new(candidate);
   }
 }


 // Return the position of `model` (compared by pointer) within this theory's
 // model list, or -1 if it is not there.
 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
   int index = 0;
   while (index < models_->size()) {
     if ((*models_)[index] == model) return index;
     index++;
   }
   return -1;
 }


 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,

                     int row, const ParagraphModel *model) {

   if (!StrongModel(model)) {

     tprintf("ValidFirstLine() should only be called with strong models!\n");

   }

   return StrongModel(model) &&

       model->ValidFirstLine(

           (*rows)[row].lmargin_, (*rows)[row].lindent_,

           (*rows)[row].rindent_, (*rows)[row].rmargin_);

 }


 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,

                    int row, const ParagraphModel *model) {

   if (!StrongModel(model)) {

     tprintf("ValidBodyLine() should only be called with strong models!\n");

   }

   return StrongModel(model) &&

       model->ValidBodyLine(

           (*rows)[row].lmargin_, (*rows)[row].lindent_,

           (*rows)[row].rindent_, (*rows)[row].rmargin_);

 }


 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,

                      int a, int b, const ParagraphModel *model) {

   if (model != kCrownRight && model != kCrownLeft) {

     tprintf("CrownCompatible() should only be called with crown models!\n");

     return false;

   }

   RowScratchRegisters &row_a = (*rows)[a];

   RowScratchRegisters &row_b = (*rows)[b];

   if (model == kCrownRight) {

     return NearlyEqual(row_a.rindent_ + row_a.rmargin_,

                        row_b.rindent_ + row_b.rmargin_,

                        Epsilon(row_a.ri_->average_interword_space));

   }

   return NearlyEqual(row_a.lindent_ + row_a.lmargin_,

                      row_b.lindent_ + row_b.lmargin_,

                      Epsilon(row_a.ri_->average_interword_space));

 }


 // =============== Implementation of ParagraphModelSmearer ====================


 // Set up a smearer over (*rows)[row_start, row_end).  If the row arguments
 // are not acceptable the smearer covers the empty range [0, 0) and holds no
 // open-model slots.
 ParagraphModelSmearer::ParagraphModelSmearer(
     GenericVector<RowScratchRegisters> *rows,
     int row_start, int row_end, ParagraphTheory *theory)
         : theory_(theory), rows_(rows), row_start_(row_start),
           row_end_(row_end) {
   if (AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
     // One empty set for each of the positions row_start - 1 .. row_end.
     SetOfModels empty_set;
     for (int slots = row_end - row_start + 2; slots > 0; slots--) {
       open_models_.push_back(empty_set);
     }
   } else {
     row_start_ = 0;
     row_end_ = 0;
   }
 }


 // see paragraphs_internal.h

 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
   SetOfModels no_models;

   // Clip the request to the range this smearer covers.
   if (row_start < row_start_) row_start = row_start_;
   if (row_end > row_end_) row_end = row_end_;

   // Begin one row early (when there is one) so that the models open at
   // row_start itself get recomputed as well.
   int row = row_start;
   if (row_start > 0) row = row_start - 1;

   for (; row < row_end; row++) {
     if ((*rows_)[row].ri_->num_words == 0) {
       // An empty row closes every model.
       OpenModels(row + 1) = no_models;
       continue;
     }

     // Models that this row starts are added to the set open at this row.
     SetOfModels &opened = OpenModels(row);
     (*rows_)[row].StartHypotheses(&opened);

     // Which models survive the transition from row to row + 1?
     SetOfModels still_open;
     for (int m = 0; m < opened.size(); m++) {
       // This is basic filtering; we check likely paragraph starty-ness down
       // below in Smear() -- you know, whether the first word would have fit
       // and such.
       const bool fits_row = ValidFirstLine(rows_, row, opened[m]) ||
                             ValidBodyLine(rows_, row, opened[m]);
       if (fits_row) still_open.push_back_new(opened[m]);
     }
     OpenModels(row + 1) = still_open;
   }
 }


 // see paragraphs_internal.h

 void ParagraphModelSmearer::Smear() {

   CalculateOpenModels(row_start_, row_end_);


   // For each row which we're unsure about (that is, it is LT_UNKNOWN or

   // we have multiple LT_START hypotheses), see if there's a model that

   // was recently used (an "open" model) which might model it well.

   for (int i = row_start_; i < row_end_; i++) {

     RowScratchRegisters &row = (*rows_)[i];

     if (row.ri_->num_words == 0)

       continue;


     // Step One:

     //   Figure out if there are "open" models which are left-alined or

     //   right-aligned.  This is important for determining whether the

     //   "first" word in a row would fit at the "end" of the previous row.

     bool left_align_open = false;

     bool right_align_open = false;

     for (int m = 0; m < OpenModels(i).size(); m++) {

       switch (OpenModels(i)[m]->justification()) {

         case JUSTIFICATION_LEFT: left_align_open = true; break;

         case JUSTIFICATION_RIGHT: right_align_open = true; break;

         default: left_align_open = right_align_open = true;

       }

     }

     // Step Two:

     //   Use that knowledge to figure out if this row is likely to

     //   start a paragraph.

     bool likely_start;

     if (i == 0) {

       likely_start = true;

     } else {

       if ((left_align_open && right_align_open) ||

           (!left_align_open && !right_align_open)) {

         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,

                                             JUSTIFICATION_LEFT) ||

                        LikelyParagraphStart((*rows_)[i - 1], row,

                                             JUSTIFICATION_RIGHT);

       } else if (left_align_open) {

         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,

                                             JUSTIFICATION_LEFT);

       } else {

         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,

                                             JUSTIFICATION_RIGHT);

       }

     }


     // Step Three:

     //   If this text line seems like an obvious first line of an

     //   open model, or an obvious continuation of an existing

     //   modelled paragraph, mark it up.

     if (likely_start) {

       // Add Start Hypotheses for all Open models that fit.

       for (int m = 0; m < OpenModels(i).size(); m++) {

         if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {

           row.AddStartLine(OpenModels(i)[m]);

         }

       }

     } else {

       // Add relevant body line hypotheses.

       SetOfModels last_line_models;

       if (i > 0) {

         (*rows_)[i - 1].StrongHypotheses(&last_line_models);

       } else {

         theory_->NonCenteredModels(&last_line_models);

       }

       for (int m = 0; m < last_line_models.size(); m++) {

         const ParagraphModel *model = last_line_models[m];

         if (ValidBodyLine(rows_, i, model))

           row.AddBodyLine(model);

       }

     }


     // Step Four:

     //   If we're still quite unsure about this line, go through all

     //   models in our theory and see if this row could be the start

     //   of any of our  models.

     if (row.GetLineType() == LT_UNKNOWN ||

         (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {

       SetOfModels all_models;

       theory_->NonCenteredModels(&all_models);

       for (int m = 0; m < all_models.size(); m++) {

         if (ValidFirstLine(rows_, i, all_models[m])) {

           row.AddStartLine(all_models[m]);

         }

       }

     }

     // Step Five:

     //   Since we may have updated the hypotheses about this row, we need

     //   to recalculate the Open models for the rest of rows[i + 1, row_end)

     if (row.GetLineType() != LT_UNKNOWN) {

       CalculateOpenModels(i + 1, row_end_);

     }

   }

 }


 // ================ Main Paragraph Detection Algorithm =======================


 // Find out what ParagraphModels are actually used, and discard any

 // that are not.

 void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,

                          ParagraphTheory *theory) {

   SetOfModels used_models;

   for (int i = 0; i < rows.size(); i++) {

     rows[i].StrongHypotheses(&used_models);

   }

   theory->DiscardUnusedModels(used_models);

 }


 // DowngradeWeakestToCrowns:

 //   Forget any flush-{left, right} models unless we see two or more

 //   of them in sequence.

 //

 // In pass 3, we start to classify even flush-left paragraphs (paragraphs

 // where the first line and body indent are the same) as having proper Models.

 // This is generally dangerous, since if you start imagining that flush-left

 // is a typical paragraph model when it is not, it will lead you to chop normal

 // indented paragraphs in the middle whenever a sentence happens to start on a

 // new line (see "This" above).  What to do?

 //   What we do is to take any paragraph which is flush left and is not

 // preceded by another paragraph of the same model and convert it to a "Crown"

 // paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder

 // for later.  It means that the paragraph is flush, but it would be desirable

 // to mark it as the same model as following text if it fits.  This downgrade

 // (FlushLeft -> CrownLeft -> Model of following paragraph) means that we

 // avoid making flush-left Paragraph Models whenever we see a top-of-the-page

 // half-of-a-paragraph, and instead we mark it the same as normal body text.

 //

 // Implementation:

 //

 //   Comb backwards through the row scratch registers, and turn any

 //   sequences of body lines of equivalent type abutted against the beginning

 //   or a body or start line of a different type into a crown paragraph.

 void DowngradeWeakestToCrowns(int debug_level,

                               ParagraphTheory *theory,

                               GenericVector<RowScratchRegisters> *rows) {

   int start;

   for (int end = rows->size(); end > 0; end = start) {

     // Search back for a body line of a unique type.

     const ParagraphModel *model = NULL;

     while (end > 0 &&

            (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {

       end--;

     }

     if (end == 0) break;

     start = end - 1;

     while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {

       start--;  // walk back to the first line that is not the same body type.

     }

     if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&

         StrongModel(model) &&

         NearlyEqual(model->first_indent(), model->body_indent(),

                     model->tolerance())) {

         start--;

     }

     start++;

     // Now rows[start, end) is a sequence of unique body hypotheses of model.

     if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)

       continue;

     if (!StrongModel(model)) {

       while (start > 0 &&

              CrownCompatible(rows, start - 1, start, model))

         start--;

     }

     if (start == 0 ||

         (!StrongModel(model)) ||

         (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {

       // crownify rows[start, end)

       const ParagraphModel *crown_model = model;

       if (StrongModel(model)) {

           if (model->justification() == JUSTIFICATION_LEFT)

             crown_model = kCrownLeft;

           else

             crown_model = kCrownRight;

       }

       (*rows)[start].SetUnknown();

       (*rows)[start].AddStartLine(crown_model);

       for (int row = start + 1; row < end; row++) {

         (*rows)[row].SetUnknown();

         (*rows)[row].AddBodyLine(crown_model);

       }

     }

   }

   DiscardUnusedModels(*rows, theory);

 }


 // Clear all hypotheses about lines [start, end) and reset margins.

 //

 // The empty space between the left of a row and the block boundary (and

 // similarly for the right) is split into two pieces: margin and indent.

 // In initial processing, we assume the block is tight and the margin for

 // all lines is set to zero.   However, if our first pass does not yield

 // models for  everything,  it may be  due to an  inset paragraph like a

 // block-quote.   In that case, we make a second pass over that unmarked

 // section of the page and reset the "margin" portion of the empty space

 // to the common amount of space at  the ends of the lines under consid-

 // eration.    This would be equivalent to percentile set to 0. However,

 // sometimes we have a single character sticking out in the right margin

 // of a text block  (like the 'r' in 'for' on line 3 above),  and we can

 // really  just ignore it as an outlier.   To express this, we allow the

 // user to specify  the percentile (0..100)  of indent values  to use as

 // the common margin for each row in the run of rows[start, end).

 void RecomputeMarginsAndClearHypotheses(
     GenericVector<RowScratchRegisters> *rows, int start, int end,
     int percentile) {
   if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
     return;

   // Pass 1: clear the hypotheses of every row and find the range of the
   // total left/right whitespace (margin + indent) over the non-empty rows.
   // The ranges are seeded from row `start` whether or not it has words.
   int lmin, lmax, rmin, rmax;
   lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
   rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
   for (int i = start; i < end; i++) {
     RowScratchRegisters &sr = (*rows)[i];
     sr.SetUnknown();
     if (sr.ri_->num_words != 0) {
       UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
       UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
     }
   }

   // Pass 2: histogram that whitespace for the non-empty rows.
   STATS lefts(lmin, lmax + 1);
   STATS rights(rmin, rmax + 1);
   for (int i = start; i < end; i++) {
     RowScratchRegisters &sr = (*rows)[i];
     if (sr.ri_->num_words != 0) {
       lefts.add(sr.lmargin_ + sr.lindent_, 1);
       rights.add(sr.rmargin_ + sr.rindent_, 1);
     }
   }

   // The requested percentile of each histogram is the new common margin.
   const double fraction = ClipToRange(percentile, 0, 100) / 100.0;
   int ignorable_left = lefts.ile(fraction);
   int ignorable_right = rights.ile(fraction);

   // Pass 3: move every row (empty ones included) to the new margins,
   // shifting the difference into the indents so margin + indent is unchanged.
   for (int i = start; i < end; i++) {
     RowScratchRegisters &sr = (*rows)[i];
     const int left_shift = ignorable_left - sr.lmargin_;
     const int right_shift = ignorable_right - sr.rmargin_;
     sr.lmargin_ += left_shift;
     sr.lindent_ -= left_shift;
     sr.rmargin_ += right_shift;
     sr.rindent_ -= right_shift;
   }
 }


 // Return the median inter-word space in rows[row_start, row_end).

 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
                    int row_start, int row_end) {
   // An empty range has no spacing to measure.
   if (row_end < row_start + 1) return 1;

   // Average the leftmost-word box of the first and last row to get a
   // representative word height and width.
   const int last_row = row_end - 1;
   int word_height = (rows[row_start].ri_->lword_box.height() +
                      rows[last_row].ri_->lword_box.height()) / 2;
   int word_width = (rows[row_start].ri_->lword_box.width() +
                     rows[last_row].ri_->lword_box.width())  / 2;

   // Only rows with more than one word contribute a spacing sample.
   STATS spacing_widths(0, 5 + word_width);
   for (int i = row_start; i < row_end; i++) {
     if (rows[i].ri_->num_words <= 1) continue;
     spacing_widths.add(rows[i].ri_->average_interword_space, 1);
   }

   // Never report less than a third of the word height, nor less than 2.
   int minimum_reasonable_space = word_height / 3;
   if (minimum_reasonable_space < 2)
     minimum_reasonable_space = 2;

   int median = spacing_widths.median();
   if (median > minimum_reasonable_space)
     return median;
   return minimum_reasonable_space;
 }


 // Return whether the first word on the after line can fit in the space at

 // the end of the before line (knowing which way the text is aligned and read).

 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                            const RowScratchRegisters &after,
                            tesseract::ParagraphJustification justification) {
   // With an empty row on either side, the answer is always yes.
   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
     return true;

   if (justification == JUSTIFICATION_UNKNOWN) {
     tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
   }

   // Room left on the `before` row: both indents for centered text, otherwise
   // the indent opposite the aligned side, less one inter-word space.
   int available_space = (justification == JUSTIFICATION_CENTER)
       ? before.lindent_ + before.rindent_
       : before.OffsideIndent(justification);
   available_space -= before.ri_->average_interword_space;

   // The first word of `after` is its leftmost word when `before` reads
   // left-to-right, and its rightmost word otherwise.
   if (!before.ri_->ltr)
     return after.ri_->rword_box.width() < available_space;
   return after.ri_->lword_box.width() < available_space;
 }


 // Return whether the first word on the after line can fit in the space at

 // the end of the before line (not knowing which way the text goes) in a left

 // or right alignment.

 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                            const RowScratchRegisters &after) {
   // With an empty row on either side, the answer is always yes.
   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
     return true;

   // Not knowing the alignment, be generous: use the larger of the two
   // indents of `before`, less one inter-word space.
   int available_space = (before.rindent_ > before.lindent_)
       ? before.rindent_ : before.lindent_;
   available_space -= before.ri_->average_interword_space;

   // The first word of `after` is its leftmost word when `before` reads
   // left-to-right, and its rightmost word otherwise.
   if (!before.ri_->ltr)
     return after.ri_->rword_box.width() < available_space;
   return after.ri_->lword_box.width() < available_space;
 }


 // Return whether the words on either side of the line break suggest a break
 // in the text: the last word of `before` likely ends an idea and the first
 // word of `after` likely starts one.  Which end of each row is "last" or
 // "first" follows the reading direction of `before`.
 bool TextSupportsBreak(const RowScratchRegisters &before,
                        const RowScratchRegisters &after) {
   const bool left_to_right = before.ri_->ltr;
   const bool prev_ends_idea = left_to_right
       ? before.ri_->rword_likely_ends_idea
       : before.ri_->lword_likely_ends_idea;
   const bool next_starts_idea = left_to_right
       ? after.ri_->lword_likely_starts_idea
       : after.ri_->rword_likely_starts_idea;
   return prev_ends_idea && next_starts_idea;
 }


 // Return whether `after` likely starts a paragraph, alignment unknown:
 // either `before` is empty, or the first word of `after` would have fit on
 // `before` and the text on both sides supports a break.
 bool LikelyParagraphStart(const RowScratchRegisters &before,
                           const RowScratchRegisters &after) {
   if (before.ri_->num_words == 0) return true;
   if (!FirstWordWouldHaveFit(before, after)) return false;
   return TextSupportsBreak(before, after);
 }


 // Return whether `after` likely starts a paragraph under justification j:
 // either `before` is empty, or the first word of `after` would have fit on
 // `before` (given j) and the text on both sides supports a break.
 bool LikelyParagraphStart(const RowScratchRegisters &before,
                           const RowScratchRegisters &after,
                           tesseract::ParagraphJustification j) {
   if (before.ri_->num_words == 0) return true;
   if (!FirstWordWouldHaveFit(before, after, j)) return false;
   return TextSupportsBreak(before, after);
 }


 // Examine rows[start, end) and try to determine what sort of ParagraphModel
 // would fit them as a single paragraph.
 //
 // rows[start] is treated as the first line of the paragraph and
 // rows[start + 1, end) as its body.  tolerance is the amount by which the
 // body indents may vary and still be considered aligned.
 //
 // If we can't produce a unique model, a default-constructed ParagraphModel
 // is returned (justification_ = JUSTIFICATION_UNKNOWN).
 // *consistent is set to true if the rows given could be a consistent start
 // to a paragraph, and to false when the outline rules that out.
 ParagraphModel InternalParagraphModelByOutline(
     const GenericVector<RowScratchRegisters> *rows,
     int start, int end, int tolerance, bool *consistent) {
   *consistent = true;
   // Validate the range (at least two rows) before reading any row: the
   // direction vote below indexes every row in rows[start, end).
   if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
     return ParagraphModel();

   // Vote on text direction: the region counts as left-to-right when at
   // least (end - start) / 2 (integer division) of its rows are ltr.
   int ltr_line_count = 0;
   for (int i = start; i < end; i++) {
     ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
   }
   bool ltr = (ltr_line_count >= (end - start) / 2);

   // Ensure the caller only passed us a region with a common rmargin and
   // lmargin.
   int lmargin = (*rows)[start].lmargin_;
   int rmargin = (*rows)[start].rmargin_;
   // Ranges of left indent, right indent, and (right - left) offset over the
   // body lines.  The offset range is seeded with 0.
   int lmin, lmax, rmin, rmax, cmin, cmax;
   lmin = lmax = (*rows)[start + 1].lindent_;
   rmin = rmax = (*rows)[start + 1].rindent_;
   cmin = cmax = 0;
   for (int i = start + 1; i < end; i++) {
     if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
       tprintf("Margins don't match! Software error.\n");
       *consistent = false;
       return ParagraphModel();
     }
     UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
     UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
     UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
   }
   int ldiff = lmax - lmin;
   int rdiff = rmax - rmin;
   int cdiff = cmax - cmin;
   // Both edges are ragged: the only remaining candidate is centered text.
   if (rdiff > tolerance && ldiff > tolerance) {
     if (cdiff < tolerance * 2) {
       if (end - start < 3)
         return ParagraphModel();
       return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
     }
     *consistent = false;
     return ParagraphModel();
   }
   if (end - start < 3)  // Don't return a model for two line paras.
     return ParagraphModel();

   // These booleans keep us from saying something is aligned left when the body
   // left variance is too large.
   bool body_admits_left_alignment = ldiff < tolerance;
   bool body_admits_right_alignment = rdiff < tolerance;

   // Candidate models: margin, first-line indent, and the midpoint of the
   // observed body indents.
   ParagraphModel left_model =
       ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
                      (lmin + lmax) / 2, tolerance);
   ParagraphModel right_model =
       ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
                      (rmin + rmax) / 2, tolerance);

   // These booleans keep us from having an indent on the "wrong side" for the
   // first line.
   bool text_admits_left_alignment = ltr || left_model.is_flush();
   bool text_admits_right_alignment = !ltr || right_model.is_flush();

   // At least one of the edges is less than tolerance in variance.
   // If the other is obviously ragged, it can't be the one aligned to.
   // [Note the last line is included in this raggedness.]
   if (tolerance < rdiff) {
     if (body_admits_left_alignment && text_admits_left_alignment)
       return left_model;
     *consistent = false;
     return ParagraphModel();
   }
   if (tolerance < ldiff) {
     if (body_admits_right_alignment && text_admits_right_alignment)
       return right_model;
     *consistent = false;
     return ParagraphModel();
   }

   // At this point, we know the body text doesn't vary much on either side.

   // If the first line juts out oddly in one direction or the other,
   // that likely indicates the side aligned to.
   int first_left = (*rows)[start].lindent_;
   int first_right = (*rows)[start].rindent_;

   if (ltr && body_admits_left_alignment &&
       (first_left < lmin || first_left > lmax))
     return left_model;
   if (!ltr && body_admits_right_alignment &&
       (first_right < rmin || first_right > rmax))
     return right_model;

   *consistent = false;
   return ParagraphModel();
 }


 // Try to fit a single ParagraphModel to rows[start, end) by delegating to
 // InternalParagraphModelByOutline and discarding its consistency flag.
 // When the result has JUSTIFICATION_UNKNOWN and debug_level is at least 2,
 // a message and the row range are printed to the debug output.
 ParagraphModel ParagraphModelByOutline(
     int debug_level,
     const GenericVector<RowScratchRegisters> *rows,
     int start, int end, int tolerance) {
   bool ignored_consistency;
   ParagraphModel outline_model = InternalParagraphModelByOutline(
       rows, start, end, tolerance, &ignored_consistency);
   if (debug_level < 2) return outline_model;
   if (outline_model.justification() != JUSTIFICATION_UNKNOWN)
     return outline_model;
   tprintf("Could not determine a model for this paragraph:\n");
   PrintRowRange(*rows, start, end);
   return outline_model;
 }


 // Check whether rows[start, end) is exactly one paragraph under model:
 // rows[start] must be a valid first line and every later row a valid body
 // line.  Returns false when the row arguments are not acceptable.
 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
                   int start, int end, const ParagraphModel *model) {
   if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
     return false;
   bool fits = ValidFirstLine(rows, start, model);
   // Stop at the first body row that does not fit.
   for (int body_row = start + 1; fits && body_row < end; ++body_row) {
     fits = ValidBodyLine(rows, body_row, model);
   }
   return fits;
 }


 // Examine rows[row_start, row_end) as an independent section of text,
 // and mark rows that are exceptionally clear as start-of-paragraph
 // and paragraph-body lines.
 //
 // We presume that any lines surrounding rows[row_start, row_end) may
 // have wildly different paragraph models, so we don't key any data off
 // of those lines.
 //
 // We only take the very strongest signals, as we don't want to get
 // confused and marking up centered text, poetry, or source code as
 // clearly part of a typical paragraph.
 //
 // Does nothing unless the range is valid and holds at least two rows.
 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
                         int row_start, int row_end) {
   // The first-row and last-row cases below index rows[row_start + 1] and
   // rows[row_end - 2], so insist on a valid range of at least two rows.
   if (!AcceptableRowArgs(0, 2, __func__, rows, row_start, row_end))
     return;

   // Record patently obvious body text: neither end word looks like the
   // start of an idea, and the first word could not have fit on the
   // previous line.
   for (int i = row_start + 1; i < row_end; i++) {
     const RowScratchRegisters &prev = (*rows)[i - 1];
     RowScratchRegisters &curr = (*rows)[i];
     tesseract::ParagraphJustification typical_justification =
         prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
     if (!curr.ri_->rword_likely_starts_idea &&
         !curr.ri_->lword_likely_starts_idea &&
         !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
       curr.SetBodyLine();
     }
   }

   // Record patently obvious start paragraph lines.
   //
   // It's an extremely good signal of the start of a paragraph that
   // the first word would have fit on the end of the previous line.
   // However, applying just that signal would have us mark random
   // start lines of lineated text (poetry and source code) and some
   // centered headings as paragraph start lines.  Therefore, we use
   // a second qualification for a paragraph start: Not only should
   // the first word of this line have fit on the previous line,
   // but also, this line should go full to the right of the block,
   // disallowing a subsequent word from having fit on this line.

   // First row: there is no previous line, so rely on the text of the row
   // itself looking like the start of an idea.
   {
     RowScratchRegisters &curr = (*rows)[row_start];
     RowScratchRegisters &next = (*rows)[row_start + 1];
     tesseract::ParagraphJustification j =
         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
     if (curr.GetLineType() == LT_UNKNOWN &&
         !FirstWordWouldHaveFit(curr, next, j) &&
         (curr.ri_->lword_likely_starts_idea ||
          curr.ri_->rword_likely_starts_idea)) {
       curr.SetStartLine();
     }
   }
   // Middle rows
   for (int i = row_start + 1; i < row_end - 1; i++) {
     RowScratchRegisters &prev = (*rows)[i - 1];
     RowScratchRegisters &curr = (*rows)[i];
     RowScratchRegisters &next = (*rows)[i + 1];
     tesseract::ParagraphJustification j =
         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
     if (curr.GetLineType() == LT_UNKNOWN &&
         !FirstWordWouldHaveFit(curr, next, j) &&
         LikelyParagraphStart(prev, curr, j)) {
       curr.SetStartLine();
     }
   }
   // Last row
   {  // the argument check at the top means we have at least two lines.
     RowScratchRegisters &prev = (*rows)[row_end - 2];
     RowScratchRegisters &curr = (*rows)[row_end - 1];
     tesseract::ParagraphJustification j =
         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
     // There is no following row, so the row is tested against itself.
     if (curr.GetLineType() == LT_UNKNOWN &&
         !FirstWordWouldHaveFit(curr, curr, j) &&
         LikelyParagraphStart(prev, curr, j)) {
       curr.SetStartLine();
     }
   }
 }


 // Look for sequences of a start line followed by some body lines in

 // rows[row_start, row_end) and create ParagraphModels for them if

 // they seem coherent.

 //

 // Returns without doing anything if AcceptableRowArgs rejects the range

 // (a minimum of two rows is requested).

 //

 // For each candidate run a model is derived with ParagraphModelByOutline.

 // Non-flush models are added to theory.  A flush model is handled

 // specially: a two-line run is cut back to its start line, a run beginning

 // at row_start is marked with kCrownLeft/kCrownRight instead of a real

 // model, and any other flush run is added to theory only when

 // allow_flush_models is true.  Rows of a modeled run receive AddStartLine /

 // AddBodyLine hypotheses for that model.

 void ModelStrongEvidence(int debug_level,

                          GenericVector<RowScratchRegisters> *rows,

                          int row_start, int row_end,

                          bool allow_flush_models,

                          ParagraphTheory *theory) {

   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))

     return;


   int start = row_start;

   while (start < row_end) {

     // Advance to the next row marked LT_START.

     while (start < row_end && (*rows)[start].GetLineType() != LT_START)

       start++;

     // A start line in the last position has no body rows to model.

     if (start >= row_end - 1)

       break;


     // Tolerance for the incremental checks comes from the row after start.

     int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);

     int end = start;

     ParagraphModel last_model;

     bool next_consistent;

     // Grow the candidate paragraph rows[start, end) one row at a time.

     do {

       ++end;

       // rows[start, end) was consistent.

       // If rows[start, end + 1) is not consistent,

       //   just model rows[start, end)

       if (end < row_end - 1) {

         RowScratchRegisters &next = (*rows)[end];

         LineType lt = next.GetLineType();

         // The next row may join if it is a known body line, or is unknown

         // and its first word would not have fit on the previous row.

         next_consistent = lt == LT_BODY ||

             (lt == LT_UNKNOWN &&

              !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));

       } else {

         next_consistent = false;

       }

       if (next_consistent) {

         ParagraphModel next_model = InternalParagraphModelByOutline(

             rows, start, end + 1, tolerance, &next_consistent);

         // Stop growing if the run was modeled with the justification that

         // matches the start row's text direction and adding this row

         // loses that justification.

         if (((*rows)[start].ri_->ltr &&

              last_model.justification() == JUSTIFICATION_LEFT &&

              next_model.justification() != JUSTIFICATION_LEFT) ||

             (!(*rows)[start].ri_->ltr &&

              last_model.justification() == JUSTIFICATION_RIGHT &&

              next_model.justification() != JUSTIFICATION_RIGHT)) {

           next_consistent = false;

         }

         last_model = next_model;

       } else {

         next_consistent = false;

       }

     } while (next_consistent && end < row_end);

     // At this point, rows[start, end) looked like it could have been a

     // single paragraph.  If we can make a good ParagraphModel for it,

     // do so and mark this sequence with that model.

     if (end > start + 1) {

       // emit a new paragraph if we have more than one line.

       const ParagraphModel *model = NULL;

       // The final model uses a tolerance derived from the interword space

       // of the whole run rather than of a single row.

       ParagraphModel new_model = ParagraphModelByOutline(

           debug_level, rows, start, end,

           Epsilon(InterwordSpace(*rows, start, end)));

       if (new_model.justification() == JUSTIFICATION_UNKNOWN) {

         // couldn't create a good model, oh well.

       } else if (new_model.is_flush()) {

         if (end == start + 2) {

           // It's very likely we just got two paragraph starts in a row.

           end = start + 1;

         } else if (start == row_start) {

           // Mark this as a Crown.

           if (new_model.justification() == JUSTIFICATION_LEFT) {

             model = kCrownLeft;

           } else {

             model = kCrownRight;

           }

         } else if (allow_flush_models) {

           model = theory->AddModel(new_model);

         }

       } else {

         model = theory->AddModel(new_model);

       }

       if (model) {

         (*rows)[start].AddStartLine(model);

         for (int i = start + 1; i < end; i++) {

           (*rows)[i].AddBodyLine(model);

         }

       }

     }

     // Resume the search at the first row not consumed by this run.

     start = end;

   }

 }


 // Classify rows[row_start, row_end) using only strong evidence:
 //   (1) Recompute margins and clear all existing hypotheses for the rows
 //       being considered.
 //   (2) Mark rows that are exceptionally likely to be paragraph starts or
 //       paragraph body lines, using both geometric and textual clues.
 //   (3) Form models for any sequence of start + continuation lines.
 //   (4) Smear the paragraph models to cover surrounding text.
 // Nothing happens if AcceptableRowArgs rejects the range (two rows are
 // requested as a minimum).
 void StrongEvidenceClassify(int debug_level,
                             GenericVector<RowScratchRegisters> *rows,
                             int row_start, int row_end,
                             ParagraphTheory *theory) {
   const bool args_ok = AcceptableRowArgs(debug_level, 2, __func__, rows,
                                          row_start, row_end);
   if (!args_ok) return;

   if (debug_level >= 2) {
     tprintf("#############################################\n");
     tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
     tprintf("#############################################\n");
   }
   const bool dump_details = debug_level > 2;

   // Steps (1) and (2).
   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
   MarkStrongEvidence(rows, row_start, row_end);
   DebugDump(dump_details, "Initial strong signals.", *theory, *rows);

   // Step (3): create paragraph models; flush models are not allowed here.
   ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
   DebugDump(dump_details, "Unsmeared hypotheses.s.", *theory, *rows);

   // Step (4): some rows now carry model hypotheses and others are only
   // marked LT_START or LT_BODY; spread the good hypotheses forward and
   // backward.
   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
   smearer.Smear();
 }


 // For every interior row of rows[row_start, row_end) that has leaders and
 // whose two neighbours also have leaders, add a fresh
 // JUSTIFICATION_UNKNOWN model to theory and mark the row as a start line of
 // that model.
 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
                                int row_start, int row_end,
                                ParagraphTheory *theory) {
   for (int mid = row_start + 1; mid < row_end - 1; ++mid) {
     const bool leaders_all_around = (*rows)[mid - 1].ri_->has_leaders &&
                                     (*rows)[mid].ri_->has_leaders &&
                                     (*rows)[mid + 1].ri_->has_leaders;
     if (!leaders_all_around) continue;
     const ParagraphModel *leader_model = theory->AddModel(
         ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
     (*rows)[mid].AddStartLine(leader_model);
   }
 }


 // Collect sequences of unique hypotheses in row registers and create proper
 // paragraphs for them, referencing the paragraphs in row_owners.
 //
 // The rows are scanned from the last row backwards.  For each run that
 // ends in body lines of a model and begins with a start line of that same
 // model (or for a single non-body line), a new PARA is allocated and stored
 // in (*row_owners)[row] for every row of the run.  Rows that do not form
 // such a run are left untouched.  Crown models are replaced either by a
 // later paragraph's model that fits, or by a new model added to theory.
 // debug_level is currently unused.
 void ConvertHypothesizedModelRunsToParagraphs(
     int debug_level,
     const GenericVector<RowScratchRegisters> &rows,
     GenericVector<PARA *> *row_owners,
     ParagraphTheory *theory) {
   int end = rows.size();
   int start;
   for (; end > 0; end = start) {
     start = end - 1;
     const ParagraphModel *model = NULL;
     // TODO(eger): Be smarter about dealing with multiple hypotheses.
     bool single_line_paragraph = false;
     SetOfModels models;
     rows[start].NonNullHypotheses(&models);
     if (models.size() > 0) {
       model = models[0];
       if (rows[start].GetLineType(model) != LT_BODY)
         single_line_paragraph = true;
     }
     if (model && !single_line_paragraph) {
       // walk back looking for more body lines and then a start line.
       while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
         // do nothing
       }
       // start is -1 here when the run began on row 0 with a body line.
       if (start < 0 || rows[start].GetLineType(model) != LT_START) {
         model = NULL;
       }
     }
     if (model == NULL) {
       continue;
     }
     // rows[start, end) should be a paragraph.
     PARA *p = new PARA();
     if (model == kCrownLeft || model == kCrownRight) {
       p->is_very_first_or_continuation = true;
       // Crown paragraph.
       //   If we can find an existing ParagraphModel that fits, use it,
       //   else create a new one.
       for (int row = end; row < rows.size(); row++) {
         if ((*row_owners)[row] &&
             (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
             (start == 0 ||
              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
           model = (*row_owners)[row]->model;
           break;
         }
       }
       if (model == kCrownLeft) {
         // No subsequent model fits, so cons one up.
         model = theory->AddModel(ParagraphModel(
             JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
       } else if (model == kCrownRight) {
         // No subsequent model fits, so cons one up.  The margin mirrors
         // the left case: right margin plus right indent of the first row.
         model = theory->AddModel(ParagraphModel(
             JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
       }
     }
     // Replace whatever hypotheses the rows had with the chosen model.
     rows[start].SetUnknown();
     rows[start].AddStartLine(model);
     for (int i = start + 1; i < end; i++) {
       rows[i].SetUnknown();
       rows[i].AddBodyLine(model);
     }
     p->model = model;
     p->has_drop_cap = rows[start].ri_->has_drop_cap;
     // The list-item marker is read from the word on the justified side.
     p->is_list_item =
         model->justification() == JUSTIFICATION_RIGHT
             ? rows[start].ri_->rword_indicates_list_item
             : rows[start].ri_->lword_indicates_list_item;
     for (int row = start; row < end; row++) {
       if ((*row_owners)[row] != NULL) {
         tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
                 "more than once!\n");
       }
       (*row_owners)[row] = p;
     }
   }
 }


 // A pair of integer bounds.  Default construction yields (0, 0).
 struct Interval {
   int begin;
   int end;

   Interval() : begin(0), end(0) {}
   Interval(int first, int last) : begin(first), end(last) {}
 };


 // Return whether rows[row] appears to be stranded, meaning that the evidence
 // for this row is very weak due to context.  For instance, two lines of source
 // code may happen to be indented at the same tab vector as body text starts,
 // leading us to think they are two start-of-paragraph lines.  This is not
 // optimal.  However, we also don't want to mark a sequence of short dialog
 // as "weak," so our heuristic is:
 //   (1) If a line is surrounded by lines of unknown type, it's weak.
 //   (2) If two lines in a row are start lines for a given paragraph type, but
 //       after that the same paragraph type does not continue, they're weak.
 //
 // For each strong hypothesis of the row, the contiguous run of rows that
 // carry the same model is measured in both directions.  The row is not
 // stranded as soon as one model gives a run longer than two rows, or a run
 // of at least two rows that contains something other than start lines.
 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
   SetOfModels row_models;
   rows[row].StrongHypotheses(&row_models);

   for (int m = 0; m < row_models.size(); m++) {
     // Whether every line of the run, this row included, is a start line
     // under the model being examined.
     bool all_starts = rows[row].GetLineType(row_models[m]) == LT_START;
     int run_length = 1;
     bool continues = true;
     // Extend the run backwards.
     for (int i = row - 1; i >= 0 && continues; i--) {
       switch (rows[i].GetLineType(row_models[m])) {
         case LT_START: run_length++; break;
         case LT_MULTIPLE:  // explicit fall-through
         case LT_BODY: run_length++; all_starts = false; break;
         case LT_UNKNOWN:  // explicit fall-through
         default: continues = false;
       }
     }
     continues = true;
     // Extend the run forwards.
     for (int i = row + 1; i < rows.size() && continues; i++) {
       switch (rows[i].GetLineType(row_models[m])) {
         case LT_START: run_length++; break;
         case LT_MULTIPLE:  // explicit fall-through
         case LT_BODY: run_length++; all_starts = false; break;
         case LT_UNKNOWN:  // explicit fall-through
         default: continues = false;
       }
     }
     if (run_length > 2 || (!all_starts && run_length > 1)) return false;
   }
   return true;
 }


 // Go through rows[row_start, row_end) and gather up sequences that need better

 // classification.

 // + Sequences of non-empty rows without hypotheses.

 // + Crown paragraphs not immediately followed by a strongly modeled line.

 // + Single line paragraphs surrounded by text that doesn't match the

 //   model.

 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,

                       GenericVector<Interval> *to_fix,

                       int row_start, int row_end) {

   to_fix->clear();

   for (int i = row_start; i < row_end; i++) {

     bool needs_fixing = false;


     SetOfModels models;

     SetOfModels models_w_crowns;

     rows[i].StrongHypotheses(&models);

     rows[i].NonNullHypotheses(&models_w_crowns);

     if (models.empty() && models_w_crowns.size() > 0) {

       // Crown paragraph.  Is it followed by a modeled line?

       for (int end = i + 1; end < rows.size(); end++) {

         SetOfModels end_models;

         SetOfModels strong_end_models;

         rows[end].NonNullHypotheses(&end_models);

         rows[end].StrongHypotheses(&strong_end_models);

         if (end_models.size() == 0) {

           needs_fixing = true;

           break;

         } else if (strong_end_models.size() > 0) {

           needs_fixing = false;

           break;

         }

       }

     } else if (models.empty() && rows[i].ri_->num_words > 0) {

       // No models at all.

       needs_fixing = true;

     }


     if (!needs_fixing && !models.empty()) {

       needs_fixing = RowIsStranded(rows, i);

     }


     if (needs_fixing) {

       if (!to_fix->empty() && to_fix->back().end == i - 1)

         to_fix->back().end = i;

       else

         to_fix->push_back(Interval(i, i));

     }

   }

   // Convert inclusive intervals to half-open intervals.

   for (int i = 0; i < to_fix->size(); i++) {

     (*to_fix)[i].end = (*to_fix)[i].end + 1;

   }

 }


 // Make every entry of row_owners point to a PARA and rebuild paragraphs.
 // Each maximal run of NULL entries is given one newly allocated PARA.
 // paragraphs is cleared and then receives each distinct consecutive owner
 // once, in row order.
 void CanonicalizeDetectionResults(
     GenericVector<PARA *> *row_owners,
     PARA_LIST *paragraphs) {
   GenericVector<PARA *> &owners = *row_owners;
   paragraphs->clear();
   PARA_IT appender(paragraphs);
   PARA *filler = NULL;  // PARA most recently created for a run of NULLs.
   for (int idx = 0; idx < owners.size(); ++idx) {
     if (owners[idx] != NULL) {
       // Same paragraph as the previous row: already emitted.
       if (idx > 0 && owners[idx - 1] == owners[idx]) continue;
     } else if (idx > 0 && owners[idx - 1] == filler) {
       // Continue the current run of formerly-NULL rows.
       owners[idx] = filler;
       continue;
     } else {
       // First NULL row of a new run.
       owners[idx] = filler = new PARA();
     }
     appender.add_after_then_move(owners[idx]);
   }
 }


 // Main entry point for Paragraph Detection Algorithm.

 //

 // Given a set of equally spaced textlines (described by row_infos),

 // Split them into paragraphs.

 //

 // Output:

 //   row_owners - one pointer for each row, to the paragraph it belongs to.

 //   paragraphs - this is the actual list of PARA objects.

 //   models - the list of paragraph models referenced by the PARA objects.

 //            caller is responsible for deleting the models.

 void DetectParagraphs(int debug_level,

                       GenericVector<RowInfo> *row_infos,

                       GenericVector<PARA *> *row_owners,

                       PARA_LIST *paragraphs,

                       GenericVector<ParagraphModel *> *models) {

   GenericVector<RowScratchRegisters> rows;

   ParagraphTheory theory(models);


   // Initialize row_owners to be a bunch of NULL pointers.

   row_owners->init_to_size(row_infos->size(), NULL);


   // Set up row scratch registers for the main algorithm.

   rows.init_to_size(row_infos->size(), RowScratchRegisters());

   for (int i = 0; i < row_infos->size(); i++) {

     rows[i].Init((*row_infos)[i]);

   }


   // Pass 1:

   //   Detect sequences of lines that all contain leader dots (.....)

   //   These are likely Tables of Contents.  If there are three text lines in

   //   a row with leader dots, it's pretty safe to say the middle one should

   //   be a paragraph of its own.

   SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);


   DebugDump(debug_level > 1, "End of Pass 1", theory, rows);


   GenericVector<Interval> leftovers;

   LeftoverSegments(rows, &leftovers, 0, rows.size());

   for (int i = 0; i < leftovers.size(); i++) {

     // Pass 2a:

     //   Find any strongly evidenced start-of-paragraph lines.  If they're

     //   followed by two lines that look like body lines, make a paragraph

     //   model for that and see if that model applies throughout the text

     //   (that is, "smear" it).

     StrongEvidenceClassify(debug_level, &rows,

                            leftovers[i].begin, leftovers[i].end, &theory);


     // Pass 2b:

     //   If we had any luck in pass 2a, we got part of the page and didn't

     //   know how to classify a few runs of rows. Take the segments that

     //   didn't find a model and reprocess them individually.

     GenericVector<Interval> leftovers2;

     LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);

     bool pass2a_was_useful = leftovers2.size() > 1 ||

         (leftovers2.size() == 1 &&

          (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));

     if (pass2a_was_useful) {

       for (int j = 0; j < leftovers2.size(); j++) {

         StrongEvidenceClassify(debug_level, &rows,

                                leftovers2[j].begin, leftovers2[j].end,

                                &theory);

       }

     }

   }


   DebugDump(debug_level > 1, "End of Pass 2", theory, rows);


   // Pass 3:

   //   These are the dregs for which we didn't have enough strong textual

   //   and geometric clues to form matching models for.  Let's see if

   //   the geometric clues are simple enough that we could just use those.

   LeftoverSegments(rows, &leftovers, 0, rows.size());

   for (int i = 0; i < leftovers.size(); i++) {

     GeometricClassify(debug_level, &rows,

                       leftovers[i].begin, leftovers[i].end, &theory);

   }


   // Undo any flush models for which there's little evidence.

   DowngradeWeakestToCrowns(debug_level, &theory, &rows);


   DebugDump(debug_level > 1, "End of Pass 3", theory, rows);


   // Pass 4:

   //   Take everything that's still not marked up well and clear all markings.

   LeftoverSegments(rows, &leftovers, 0, rows.size());

   for (int i = 0; i < leftovers.size(); i++) {

     for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {

       rows[j].SetUnknown();

     }

   }


   DebugDump(debug_level > 1, "End of Pass 4", theory, rows);


   // Convert all of the unique hypothesis runs to PARAs.

   ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,

                                            &theory);


   DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);


   // Finally, clean up any dangling NULL row paragraph parents.

   CanonicalizeDetectionResults(row_owners, paragraphs);

 }


 // ============ Code interfacing with the rest of Tesseract ==================


 // Fill in the text and word-box fields of info for a text line that has

 // no recognized text available: placeholder text is made of one "x" per

 // symbol, with a space between words.  Sets info->text, lword_text,

 // rword_text, num_words, lword_box and rword_box.  If the line yields no

 // symbols the function returns before touching num_words or the boxes.

 void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,

                                           RowInfo *info) {

   // Set up text, lword_text, and rword_text (mostly for debug printing).

   STRING fake_text;

   PageIterator pit(static_cast<const PageIterator&>(it));

   bool first_word = true;

   if (!pit.Empty(RIL_WORD)) {

     // Visit each symbol of the text line.

     do {

       fake_text += "x";

       if (first_word) info->lword_text += "x";

       info->rword_text += "x";

       // At the last symbol of a word that is not the last word of the

       // line: emit a space and restart rword_text, so that it ends up

       // holding only the final word's placeholder.

       if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&

           !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {

         fake_text += " ";

         info->rword_text = "";

         first_word = false;

       }

     } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&

              pit.Next(RIL_SYMBOL));

   }

   if (fake_text.size() == 0) return;


   // Pad the text with one space per interword space of left distance.

   // NOTE(review): assumes info->average_interword_space is nonzero here;

   // InitializeRowInfo sets it to at least 1 before calling - confirm for

   // any other caller.

   int lspaces = info->pix_ldistance / info->average_interword_space;

   for (int i = 0; i < lspaces; i++) {

     info->text += ' ';

   }

   info->text += fake_text;


   // Set up lword_box, rword_box, and num_words.

   PAGE_RES_IT page_res_it = *it.PageResIt();

   WERD_RES *word_res = page_res_it.restart_row();

   ROW_RES *this_row = page_res_it.row();


   WERD_RES *lword = NULL;

   WERD_RES *rword = NULL;

   info->num_words = 0;

   // Walk the words of this row, remembering the first and last non-NULL

   // word.  A word is counted only when it differs from the previous one.

   do {

     if (word_res) {

       if (!lword) lword = word_res;

       if (rword != word_res) info->num_words++;

       rword = word_res;

     }

     word_res = page_res_it.forward();

   } while (page_res_it.row() == this_row);


   if (lword) info->lword_box = lword->word->bounding_box();

   if (rword) info->rword_box = rword->word->bounding_box();

 }


 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph

 // detector RowInfo with all relevant information from the row.

 void InitializeRowInfo(bool after_recognition,

                        const MutableIterator &it,

                        RowInfo *info) {

   if (it.PageResIt()->row() != NULL) {

     ROW *row = it.PageResIt()->row()->row;

     info->pix_ldistance = row->lmargin();

     info->pix_rdistance = row->rmargin();

     info->average_interword_space =

         row->space() > 0 ? row->space() : MAX(row->x_height(), 1);

     info->pix_xheight = row->x_height();

     info->has_leaders = false;

     info->has_drop_cap = row->has_drop_cap();

     info->ltr = true;  // set below depending on word scripts

   } else {

     info->pix_ldistance = info->pix_rdistance = 0;

     info->average_interword_space = 1;

     info->pix_xheight = 1.0;

     info->has_leaders = false;

     info->has_drop_cap = false;

     info->ltr = true;

   }


   info->num_words = 0;

   info->lword_indicates_list_item = false;

   info->lword_likely_starts_idea = false;

   info->lword_likely_ends_idea = false;

   info->rword_indicates_list_item = false;

   info->rword_likely_starts_idea = false;

   info->rword_likely_ends_idea = false;

   info->has_leaders = false;

   info->ltr = 1;


   if (!after_recognition) {

     InitializeTextAndBoxesPreRecognition(it, info);

     return;

   }

   info->text = "";

   char *text = it.GetUTF8Text(RIL_TEXTLINE);

   int trailing_ws_idx = strlen(text);  // strip trailing space

   while (trailing_ws_idx > 0 &&

          // isspace() only takes ASCII

          ((text[trailing_ws_idx - 1] & 0x80) == 0) &&

          isspace(text[trailing_ws_idx - 1]))

     trailing_ws_idx--;

   if (trailing_ws_idx > 0) {

     int lspaces = info->pix_ldistance / info->average_interword_space;

     for (int i = 0; i < lspaces; i++)

       info->text += ' ';

     for (int i = 0; i < trailing_ws_idx; i++)

       info->text += text[i];

   }

   delete []text;


   if (info->text.size() == 0) {

     return;

   }


   PAGE_RES_IT page_res_it = *it.PageResIt();

   GenericVector<WERD_RES *> werds;

   WERD_RES *word_res = page_res_it.restart_row();

   ROW_RES *this_row = page_res_it.row();

   int num_leaders = 0;

   int ltr = 0;

   int rtl = 0;

   do {

     if (word_res && word_res->best_choice->unichar_string().length() > 0) {

       werds.push_back(word_res);

       ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;

       rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;

       if (word_res->word->flag(W_REP_CHAR)) num_leaders++;

     }

     word_res = page_res_it.forward();

   } while (page_res_it.row() == this_row);

   info->ltr = ltr >= rtl;

   info->has_leaders = num_leaders > 3;

   info->num_words = werds.size();

   if (werds.size() > 0) {

     WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];

     info->lword_text = lword->best_choice->unichar_string().string();

     info->rword_text = rword->best_choice->unichar_string().string();

     info->lword_box = lword->word->bounding_box();

     info->rword_box = rword->word->bounding_box();

     LeftWordAttributes(lword->uch_set, lword->best_choice,

                        info->lword_text,

                        &info->lword_indicates_list_item,

                        &info->lword_likely_starts_idea,

                        &info->lword_likely_ends_idea);

     RightWordAttributes(rword->uch_set, rword->best_choice,

                         info->rword_text,

                         &info->rword_indicates_list_item,

                         &info->rword_likely_starts_idea,

                         &info->rword_likely_ends_idea);

   }

 }


 // This is called after rows have been identified and words are recognized.
 // Much of this could be implemented before word recognition, but text helps
 // to identify bulleted lists and gives good signals for sentence boundaries.
 //
 // block_start points at the first text line of a block.  The block's
 // paragraph list is cleared and rebuilt, and every non-empty row of the
 // block is given its owning PARA via set_para.  Rows of a non-text (image)
 // block are not analyzed; they are all assigned to a single new PARA.
 // Models created along the way are appended to models.
 void DetectParagraphs(int debug_level,
                       bool after_text_recognition,
                       const MutableIterator *block_start,
                       GenericVector<ParagraphModel *> *models) {
   if (block_start->Empty(RIL_TEXTLINE)) {
     return;
   }
   // Clear out any preconceived notions.
   BLOCK *block = block_start->PageResIt()->block()->block;
   block->para_list()->clear();
   bool is_image_block = block->poly_block() && !block->poly_block()->IsText();

   // Convert the Tesseract structures to RowInfos
   // for the paragraph detection algorithm.
   MutableIterator row(*block_start);
   if (row.Empty(RIL_TEXTLINE))
     return;  // end of input already.

   GenericVector<RowInfo> row_infos;
   do {
     if (!row.PageResIt()->row())
       continue;  // empty row.
     row.PageResIt()->row()->row->set_para(NULL);
     row_infos.push_back(RowInfo());
     RowInfo &ri = row_infos.back();
     InitializeRowInfo(after_text_recognition, row, &ri);
   } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
            row.Next(RIL_TEXTLINE));

   // If we're called before text recognition, we might not have
   // tight block bounding boxes, so trim by the minimum on each side.
   if (row_infos.size() > 0) {
     int min_lmargin = row_infos[0].pix_ldistance;
     int min_rmargin = row_infos[0].pix_rdistance;
     for (int i = 1; i < row_infos.size(); i++) {
       if (row_infos[i].pix_ldistance < min_lmargin)
         min_lmargin = row_infos[i].pix_ldistance;
       if (row_infos[i].pix_rdistance < min_rmargin)
         min_rmargin = row_infos[i].pix_rdistance;
     }
     if (min_lmargin > 0 || min_rmargin > 0) {
       for (int i = 0; i < row_infos.size(); i++) {
         row_infos[i].pix_ldistance -= min_lmargin;
         row_infos[i].pix_rdistance -= min_rmargin;
       }
     }
   }

   // Run the paragraph detection algorithm.
   GenericVector<PARA *> row_owners;
   if (!is_image_block) {
     DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
                      models);
   } else {
     row_owners.init_to_size(row_infos.size(), NULL);
     CanonicalizeDetectionResults(&row_owners, block->para_list());
   }

   // Now stitch in the row_owners into the rows.  row_owners has one entry
   // per non-empty row, in the order they were visited above.
   row = *block_start;
   for (int i = 0; i < row_owners.size(); i++) {
     // Skip empty rows; give up if the iterator cannot advance any further
     // rather than spinning on a row that will never appear.
     while (!row.PageResIt()->row()) {
       if (!row.Next(RIL_TEXTLINE))
         return;
     }
     row.PageResIt()->row()->row->set_para(row_owners[i]);
     row.Next(RIL_TEXTLINE);
   }
 }


 }  // namespace

tesseract::UnicodeSpanSkipper::SkipPunc
int SkipPunc(int pos)
Definition: paragraphs.cpp:302

tesseract::ParagraphModelSmearer
Definition: paragraphs_internal.h:234

tesseract::JUSTIFICATION_LEFT
Definition: publictypes.h:241

tesseract::Cluster
Definition: paragraphs.cpp:643

tesseract::InternalParagraphModelByOutline
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
Definition: paragraphs.cpp:1692

tesseract::RowScratchRegisters::SetBodyLine
void SetBodyLine()
Definition: paragraphs.cpp:572

tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
Definition: paragraphs.cpp:1646

tesseract::RowInfo::rword_indicates_list_item
bool rword_indicates_list_item
Definition: paragraphs.h:75

tesseract::UnicodeSpanSkipper::SkipAlpha
int SkipAlpha(int pos)
Definition: paragraphs.cpp:323

GenericVector::size
int size() const
Definition: genericvector.h:72

tesseract::ValidFirstLine
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1266

tesseract::IsOpeningPunct
bool IsOpeningPunct(int ch)
Definition: paragraphs.cpp:201

tesseract::RowInfo::rword_box
TBOX rword_box
Definition: paragraphs.h:56

tesseract::Cluster::center
int center
Definition: paragraphs.cpp:647

tesseract::GeometricClassifyThreeTabStopTextBlock
void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
Definition: paragraphs.cpp:985

ParagraphModel::body_indent
int body_indent() const
Definition: ocrpara.h:169

tesseract::InitializeTextAndBoxesPreRecognition
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info)
Definition: paragraphs.cpp:2359

ratngs.h

tesseract::LineHypothesis
Definition: paragraphs_internal.h:79

tesseract::UnicodeSpanSkipper::SkipRomans
int SkipRomans(int pos)
Definition: paragraphs.cpp:313

WERD_RES
Definition: pageres.h:155

ROW::space
inT32 space() const
Definition: ocrrow.h:76

tesseract::RowScratchRegisters::lmargin_
int lmargin_
Definition: paragraphs_internal.h:177

tesseract::LikelyListNumeral
bool LikelyListNumeral(const STRING &word)
Definition: paragraphs.cpp:228

ParagraphModel::tolerance
int tolerance() const
Definition: ocrpara.h:170

tesseract::GeometricClassifierState::PrintRows
void PrintRows() const
Definition: paragraphs.cpp:915

MAX
#define MAX(x, y)
Definition: ndminx.h:24

tesseract::RowScratchRegisters::UniqueStartHypothesis
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:617

unicodes.h

pageres.h

tesseract::Interval::begin
int begin
Definition: paragraphs.cpp:2126

tesseract::GeometricClassifierState::margin
int margin
Definition: paragraphs.cpp:950

tesseract::JUSTIFICATION_RIGHT
Definition: publictypes.h:243

tesseract::RowScratchRegisters
Definition: paragraphs_internal.h:100

tesseract::DiscardUnusedModels
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
Definition: paragraphs.cpp:1455

tesseract::RowsFitModel
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
Definition: paragraphs.cpp:1808

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::SimpleClusterer::SimpleClusterer
SimpleClusterer(int max_cluster_width)
Definition: paragraphs.cpp:653

tesseract::LeftWordAttributes
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:394

publictypes.h

ROW_RES
Definition: pageres.h:125

GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:737

tesseract::CalculateTabStops
void CalculateTabStops(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
Definition: paragraphs.cpp:691

tesseract::GeometricClassifierState::rows
GenericVector< RowScratchRegisters > * rows
Definition: paragraphs.cpp:932

tesseract::LT_UNKNOWN
Definition: paragraphs_internal.h:57

tesseract::RowScratchRegisters::NonNullHypotheses
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:610

tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition: paragraphs.h:73

PAGE_RES_IT
Definition: pageres.h:656

tesseract::RowScratchRegisters::lindent_
int lindent_
Definition: paragraphs_internal.h:178

tesseract::ConvertHypothesizedModelRunsToParagraphs
void ConvertHypothesizedModelRunsToParagraphs(int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
Definition: paragraphs.cpp:2041

unicharset.h

tesseract::ModelStrongEvidence
void ModelStrongEvidence(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
Definition: paragraphs.cpp:1900

tprintf
#define tprintf(...)
Definition: tprintf.h:31

tesseract::RIL_BLOCK
Definition: publictypes.h:208

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::IsTerminalPunct
bool IsTerminalPunct(int ch)
Definition: paragraphs.cpp:205

tesseract::IsDigitLike
bool IsDigitLike(int ch)
Definition: paragraphs.cpp:197

tesseract::GeometricClassifierState::eop_threshold
int eop_threshold
Definition: paragraphs.cpp:955

tesseract::InitializeRowInfo
void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info)
Definition: paragraphs.cpp:2411

tesseract::Cluster::Cluster
Cluster()
Definition: paragraphs.cpp:644

tesseract::GeometricClassifierState::GeometricClassifierState
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
Definition: paragraphs.cpp:855

tesseract::GeometricClassifierState::IsFullRow
bool IsFullRow(int i) const
Definition: paragraphs.cpp:899

tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition: paragraphs.h:72

tesseract::ParagraphModelByOutline
ParagraphModel ParagraphModelByOutline(int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
Definition: paragraphs.cpp:1793

STATS
Definition: statistc.h:33

tesseract::UnicodeFor
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:274

GenericVector::back
T & back() const
Definition: genericvector.h:668

GenericVector::sort
void sort()
Definition: genericvector.h:998

tesseract::PageIterator::Empty
bool Empty(PageIteratorLevel level) const
Definition: pageiterator.cpp:348

tesseract::UnicodeSpanSkipper::SkipDigits
int SkipDigits(int pos)
Definition: paragraphs.cpp:307

ROW::has_drop_cap
bool has_drop_cap() const
Definition: ocrrow.h:108

tesseract::kCrownRight
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:47

ROW::rmargin
inT16 rmargin() const
Definition: ocrrow.h:101

tesseract::ParagraphTheory::NonCenteredModels
void NonCenteredModels(SetOfModels *models)
Definition: paragraphs.cpp:1250

tesseract::GeometricClassifierState::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(int row_a, int row_b)
Definition: paragraphs.cpp:910

tesseract::JUSTIFICATION_CENTER
Definition: publictypes.h:242

STATS::add
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104

tesseract::PageIterator::Next
virtual bool Next(PageIteratorLevel level)
Definition: pageiterator.cpp:146

tesseract::LikelyListMarkUnicode
bool LikelyListMarkUnicode(int ch)
Definition: paragraphs.cpp:328

tesseract::RowInfo::pix_ldistance
int pix_ldistance
Definition: paragraphs.h:49

tesseract::RowInfo::pix_rdistance
int pix_rdistance
Definition: paragraphs.h:50

tesseract::RowInfo::average_interword_space
int average_interword_space
Definition: paragraphs.h:52

PARA::is_list_item
bool is_list_item
Definition: ocrpara.h:38

ParagraphModel::ValidBodyLine
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:63

ROW::x_height
float x_height() const
Definition: ocrrow.h:61

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:160

tesseract::RIL_WORD
Definition: publictypes.h:211

tesseract::GeometricClassifierState::AlignTabs
const GenericVector< Cluster > & AlignTabs() const
Definition: paragraphs.cpp:882

STRING::length
inT32 length() const
Definition: strngs.cpp:188

tesseract::ResultIterator::GetUTF8Text
virtual char * GetUTF8Text(PageIteratorLevel level) const
Definition: resultiterator.cpp:556

tesseract::RowScratchRegisters::rmargin_
int rmargin_
Definition: paragraphs_internal.h:180

tesseract::StrongModel
bool StrongModel(const ParagraphModel *model)
Definition: paragraphs_internal.h:75

tesseract::GeometricClassifierState::Model
ParagraphModel Model() const
Definition: paragraphs.cpp:923

tesseract::GeometricClassifierState::left_tabs
GenericVector< Cluster > left_tabs
Definition: paragraphs.cpp:945

tesseract::RightWordAttributes
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:441

UpdateRange
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125

tesseract::GeometricClassifierState::first_indent
int first_indent
Definition: paragraphs.cpp:951

tesseract::ParagraphModelSmearer::ParagraphModelSmearer
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:1309

tesseract::SetOfModels
GenericVectorEqEq< const ParagraphModel * > SetOfModels
Definition: paragraphs_internal.h:94

tesseract::ValidBodyLine
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1277

WERD_CHOICE
Definition: ratngs.h:271

tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition: paragraphs.h:77

ParagraphModel::first_indent
int first_indent() const
Definition: ocrpara.h:168

POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::RowInfo::text
STRING text
Definition: paragraphs.h:43

tesseract::MutableIterator
Definition: mutableiterator.h:44

WERD_RES::AnyRtlCharsInWord
bool AnyRtlCharsInWord() const
Definition: pageres.h:372

tesseract::Interval
Definition: paragraphs.cpp:2122

tesseract::RowScratchRegisters::AppendDebugHeaderFields
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
Definition: paragraphs.cpp:475

tesseract::GeometricClassifierState
Definition: paragraphs.cpp:854

tprintf.h

tesseract::RowScratchRegisters::StrongHypotheses
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:603

tesseract::RowInfo
Definition: paragraphs.h:40

paragraphs_internal.h

tesseract::LT_START
Definition: paragraphs_internal.h:55

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:99

ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115

tesseract::LeftoverSegments
void LeftoverSegments(const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
Definition: paragraphs.cpp:2181

tesseract::RowScratchRegisters::AddStartLine
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:582

tesseract::MarkStrongEvidence
void MarkStrongEvidence(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
Definition: paragraphs.cpp:1830

ROW
Definition: ocrrow.h:32

mutableiterator.h

ParagraphModel
Definition: ocrpara.h:114

tesseract::RowScratchRegisters::Init
void Init(const RowInfo &row)
Definition: paragraphs.cpp:512

tesseract::DowngradeWeakestToCrowns
void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
Definition: paragraphs.cpp:1488

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

tesseract::RowIsStranded
bool RowIsStranded(const GenericVector< RowScratchRegisters > &rows, int row)
Definition: paragraphs.cpp:2139

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:739

tesseract::RecomputeMarginsAndClearHypotheses
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
Definition: paragraphs.cpp:1558

tesseract::RowScratchRegisters::SetStartLine
void SetStartLine()
Definition: paragraphs.cpp:562

ParagraphModel::ValidFirstLine
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:46

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:713

ROW::lmargin
inT16 lmargin() const
Definition: ocrrow.h:98

tesseract::RowInfo::lword_indicates_list_item
bool lword_indicates_list_item
Definition: paragraphs.h:71

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

STATS::median
double median() const
Definition: statistc.cpp:243

tesseract::ParagraphTheory::AddModel
const ParagraphModel * AddModel(const ParagraphModel &model)
Definition: paragraphs.cpp:1214

tesseract::UnicodeSpanSkipper
Definition: paragraphs.cpp:282

tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
Definition: paragraphs.cpp:1621

tesseract::LikelyListMark
bool LikelyListMark(const STRING &word)
Definition: paragraphs.cpp:262

tesseract::LineType
LineType
Definition: paragraphs_internal.h:54

tesseract::RowScratchRegisters::StartHypotheses
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:596

tesseract::MarkRowsWithModel
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
Definition: paragraphs.cpp:807

tesseract::RowScratchRegisters::AddBodyLine
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:589

tesseract::RowScratchRegisters::ri_
const RowInfo * ri_
Definition: paragraphs_internal.h:170

tesseract::RIL_SYMBOL
Definition: publictypes.h:212

tesseract::CanonicalizeDetectionResults
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
Definition: paragraphs.cpp:2232

BLOCK::para_list
PARA_LIST * para_list()
Definition: ocrblock.h:128

tesseract::JUSTIFICATION_UNKNOWN
Definition: publictypes.h:240

STATS::ile
double ile(double frac) const
Definition: statistc.cpp:177

tesseract::RowScratchRegisters::SetUnknown
void SetUnknown()
Definition: paragraphs_internal.h:123

tesseract::RowInfo::has_leaders
bool has_leaders
Definition: paragraphs.h:47

tesseract::GeometricClassifierState::AssumeLeftJustification
void AssumeLeftJustification()
Definition: paragraphs.cpp:871

NearlyEqual
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:148

rect.h

tesseract::GeometricClassifierState::AlignsideTabIndex
int AlignsideTabIndex(int row_idx) const
Definition: paragraphs.cpp:904

tesseract::LT_MULTIPLE
Definition: paragraphs_internal.h:58

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

UNICHARSET::id_to_unichar
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

tesseract::GeometricClassifierState::row_start
int row_start
Definition: paragraphs.cpp:933

UNICHAR::first_uni
int first_uni() const
Definition: unichar.cpp:97

tesseract::GeometricClassifierState::AssumeRightJustification
void AssumeRightJustification()
Definition: paragraphs.cpp:876

BLOCK
Definition: ocrblock.h:30

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

W_REP_CHAR
Definition: werd.h:41

tesseract::Interval::Interval
Interval(int b, int e)
Definition: paragraphs.cpp:2124

tesseract::GeometricClassifierState::ltr
bool ltr
Definition: paragraphs.cpp:941

GenericVector::init_to_size
void init_to_size(int size, T t)
Definition: genericvector.h:646

tesseract::RIL_TEXTLINE
Definition: publictypes.h:210

tesseract::RowInfo::lword_text
STRING lword_text
Definition: paragraphs.h:58

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:736

GenericVectorEqEq
Definition: resultiterator.h:29

paragraphs.h

tesseract::ResultIterator::IsAtFinalElement
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
Definition: resultiterator.cpp:532

tesseract::SkipChars
const char * SkipChars(const char *str, const char *toskip)
Definition: paragraphs.cpp:210

tesseract::Interval::Interval
Interval()
Definition: paragraphs.cpp:2123

tesseract::RowInfo::ltr
bool ltr
Definition: paragraphs.h:44

PARA
Definition: ocrpara.h:29

PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1636

tesseract::GeometricClassifierState::OffsideTabs
const GenericVector< Cluster > & OffsideTabs() const
Definition: paragraphs.cpp:892

tesseract::RowInfo::pix_xheight
float pix_xheight
Definition: paragraphs.h:51

ParagraphModel::justification
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164

STRING::size
inT32 size() const
Definition: strngs.h:66

statistc.h

tesseract::kRLE
const char * kRLE
Definition: unicodes.cpp:29

tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:239

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesseract::SimpleClusterer
Definition: paragraphs.cpp:651

UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477

GenericVector::clear
void clear()
Definition: genericvector.h:806

tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition: paragraphs.h:76

UNICHAR
Definition: unichar.h:52

tesseract::kPDF
const char * kPDF
Definition: unicodes.cpp:30

tesseract::RowInfo::lword_box
TBOX lword_box
Definition: paragraphs.h:55

tesseract::LT_BODY
Definition: paragraphs_internal.h:56

helpers.h

WERD_RES::word
WERD * word
Definition: pageres.h:175

tesseract::RowScratchRegisters::UniqueBodyHypothesis
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:623

GenericVector::contains
bool contains(T object) const
Definition: genericvector.h:731

GenericVector::push_back_new
int push_back_new(T object)
Definition: genericvector.h:747

GenericVector::empty
bool empty() const
Definition: genericvector.h:84

tesseract::RowScratchRegisters::DiscardNonMatchingHypotheses
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:630

tesseract::kCrownLeft
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:45

tesseract::Interval::end
int end
Definition: paragraphs.cpp:2127

tesseract::RowInfo::rword_text
STRING rword_text
Definition: paragraphs.h:59

GenericVector::remove
void remove(int index)
Definition: genericvector.h:704

tesseract::RowScratchRegisters::AppendDebugInfo
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
Definition: paragraphs.cpp:481

tesseract::SimpleClusterer::GetClusters
void GetClusters(GenericVector< Cluster > *clusters)
Definition: paragraphs.cpp:675

tesseract::MutableIterator::PageResIt
const PAGE_RES_IT * PageResIt() const
Definition: mutableiterator.h:59

tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
Definition: paragraphs.cpp:2264

TBOX::width
inT16 width() const
Definition: rect.h:111

tesseract::GeometricClassifierState::debug_level
int debug_level
Definition: paragraphs.cpp:928

tesseract::LikelyParagraphStart
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
Definition: paragraphs.cpp:1672

tesseract::SeparateSimpleLeaderLines
void SeparateSimpleLeaderLines(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:2025

tesseract::UnicodeSpanSkipper::UnicodeSpanSkipper
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
Definition: paragraphs.cpp:284

tesseract
Definition: baseapi.cpp:83

PARA::is_very_first_or_continuation
bool is_very_first_or_continuation
Definition: ocrpara.h:43

count
int count(LIST var_list)
Definition: oldlist.cpp:108

ParagraphModel::is_flush
bool is_flush() const
Definition: ocrpara.h:171

tesseract::GeometricClassifierState::Fail
void Fail(int min_debug_level, const char *why) const
Definition: paragraphs.cpp:917

ROW_RES::row
ROW * row
Definition: pageres.h:127

tesseract::RowInfo::has_drop_cap
bool has_drop_cap
Definition: paragraphs.h:48

tesseract::ParagraphTheory
Definition: paragraphs_internal.h:189

tesseract::RtlEmbed
STRING RtlEmbed(const STRING &word, bool rtlify)
Definition: paragraphs.cpp:121

UNICHARSET
Definition: unicharset.h:139

tesseract::RowScratchRegisters::rindent_
int rindent_
Definition: paragraphs_internal.h:179

WERD::flag
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::TextSupportsBreak
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
Definition: paragraphs.cpp:1661

strngs.h

tesseract::PageIterator::IsAtFinalElement
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
Definition: pageiterator.cpp:208

STRING
Definition: strngs.h:44

tesseract::ParagraphTheory::DiscardUnusedModels
void DiscardUnusedModels(const SetOfModels &used_models)
Definition: paragraphs.cpp:1225

tesseract::RowScratchRegisters::GetLineType
LineType GetLineType() const
Definition: paragraphs.cpp:520

WERD_RES::AnyLtrCharsInWord
bool AnyLtrCharsInWord() const
Definition: pageres.h:389

tesseract::SkipOne
const char * SkipOne(const char *str, const char *toskip)
Definition: paragraphs.cpp:220

NULL
#define NULL
Definition: host.h:144

tesseract::SimpleClusterer::Add
void Add(int value)
Definition: paragraphs.cpp:655

tesseract::CrownCompatible
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
Definition: paragraphs.cpp:1288

tesseract::Cluster::count
int count
Definition: paragraphs.cpp:648

tesseract::ResultIterator::Next
virtual bool Next(PageIteratorLevel level)
Definition: resultiterator.cpp:421

tesseract::AsciiLikelyListItem
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:267

tesseract::ParagraphTheory::Fits
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
Definition: paragraphs.cpp:1239

PARA::model
const ParagraphModel * model
Definition: ocrpara.h:36

tesseract::GeometricClassify
void GeometricClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:1077

GenericVector
Definition: baseapi.h:41

tesseract::RowInfo::num_words
int num_words
Definition: paragraphs.h:54

tesseract::SimpleClusterer::size
int size() const
Definition: paragraphs.cpp:656

tesseract::GeometricClassifierState::body_indent
int body_indent
Definition: paragraphs.cpp:952

STRING::string
const char * string() const
Definition: strngs.cpp:193

tesseract::RowScratchRegisters::OffsideIndent
int OffsideIndent(tesseract::ParagraphJustification just) const
Definition: paragraphs_internal.h:146

tesseract::InterwordSpace
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
Definition: paragraphs.cpp:1598

PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59

ROW::set_para
void set_para(PARA *p)
Definition: ocrrow.h:112

tesseract::ClosestCluster
int ClosestCluster(const GenericVector< Cluster > &clusters, int value)
Definition: paragraphs.cpp:665

GenericVector::get_index
int get_index(T object) const
Definition: genericvector.h:720

tesseract::GeometricClassifierState::row_end
int row_end
Definition: paragraphs.cpp:934

tesseract::PageIterator
Definition: pageiterator.h:52

tesseract::UniLikelyListItem
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
Definition: paragraphs.cpp:357

ocrpara.h

PARA::has_drop_cap
bool has_drop_cap
Definition: ocrpara.h:46

tesseract::ParagraphModelSmearer::Smear
void Smear()
Definition: paragraphs.cpp:1356

tesseract::StrongEvidenceClassify
void StrongEvidenceClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:1995

tesseract::GeometricClassifierState::just
tesseract::ParagraphJustification just
Definition: paragraphs.cpp:949

tesseract::GeometricClassifierState::tolerance
int tolerance
Definition: paragraphs.cpp:937

genericvector.h

tesseract::IsLatinLetter
bool IsLatinLetter(int ch)
Definition: paragraphs.cpp:193

tesseract::Cluster::Cluster
Cluster(int cen, int num)
Definition: paragraphs.cpp:645

tesseract::GeometricClassifierState::right_tabs
GenericVector< Cluster > right_tabs
Definition: paragraphs.cpp:946

tesseract::ParagraphTheory::IndexOf
int IndexOf(const ParagraphModel *model) const
Definition: paragraphs.cpp:1258