tesseract  5.0.0-alpha-619-ge9db
paragraphs_internal.h
Go to the documentation of this file.
/**********************************************************************
 * File: paragraphs_internal.h
 * Description: Paragraph Detection internal data structures.
 * Author: David Eger
 *
 * (C) Copyright 2011, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
18 
19 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
20 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21 
22 #include "paragraphs.h"
23 #include <tesseract/publictypes.h> // for ParagraphJustification
24 
25 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
26 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
27 
// Forward declarations of global-namespace types that are only used by
// pointer in the declarations of this header.
class UNICHARSET;
class WERD_CHOICE;
30 
31 namespace tesseract {
32 
33 // Return whether the given word is likely to be a list item start word.
34 bool AsciiLikelyListItem(const STRING &word);
35 
36 // Return the first Unicode Codepoint from werd[pos].
37 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
38 
39 // Set right word attributes given either a unicharset and werd or a utf8
40 // string.
41 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
42  const STRING &utf8,
43  bool *is_list, bool *starts_idea, bool *ends_idea);
44 
45 // Set left word attributes given either a unicharset and werd or a utf8 string.
46 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
47  const STRING &utf8,
48  bool *is_list, bool *starts_idea, bool *ends_idea);
49 
// The role a text line plays within a paragraph. Each enumerator is given
// a printable character value ('S', 'C', 'U', 'M').
enum LineType {
  LT_START = 'S',     // First line of a paragraph.
  LT_BODY = 'C',      // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',   // No clues.
  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
};
56 
57 // The first paragraph in a page of body text is often un-indented.
58 // This is a typographic convention which is common to indicate either that:
59 // (1) The paragraph is the continuation of a previous paragraph, or
60 // (2) The paragraph is the first paragraph in a chapter.
61 //
62 // I refer to such paragraphs as "crown"s, and the output of the paragraph
63 // detection algorithm attempts to give them the same paragraph model as
64 // the rest of the body text.
65 //
66 // Nonetheless, while building hypotheses, it is useful to mark the lines
67 // of crown paragraphs temporarily as crowns, either aligned left or right.
68 extern const ParagraphModel *kCrownLeft;
69 extern const ParagraphModel *kCrownRight;
70 
71 inline bool StrongModel(const ParagraphModel *model) {
72  return model != nullptr && model != kCrownLeft && model != kCrownRight;
73 }
74 
76  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
77  LineHypothesis(LineType line_type, const ParagraphModel *m)
78  : ty(line_type), model(m) {}
79  LineHypothesis(const LineHypothesis &other)
80  : ty(other.ty), model(other.model) {}
81 
82  // Copy assignment operator.
83  LineHypothesis& operator=(const LineHypothesis& other) {
84  ty = other.ty;
85  model = other.model;
86  return *this;
87  }
88 
89  bool operator==(const LineHypothesis &other) const {
90  return ty == other.ty && model == other.model;
91  }
92 
94  const ParagraphModel *model;
95 };
96 
97 class ParagraphTheory; // Forward Declaration
98 
100 
101 // Row Scratch Registers are data generated by the paragraph detection
102 // algorithm based on a RowInfo input.
103 class RowScratchRegisters {
104  public:
105  // We presume row will outlive us.
106  void Init(const RowInfo &row);
107 
108  LineType GetLineType() const;
109 
110  LineType GetLineType(const ParagraphModel *model) const;
111 
112  // Mark this as a start line type, sans model. This is useful for the
113  // initial marking of probable body lines or paragraph start lines.
114  void SetStartLine();
115 
116  // Mark this as a body line type, sans model. This is useful for the
117  // initial marking of probably body lines or paragraph start lines.
118  void SetBodyLine();
119 
120  // Record that this row fits as a paragraph start line in the given model,
121  void AddStartLine(const ParagraphModel *model);
122  // Record that this row fits as a paragraph body line in the given model,
123  void AddBodyLine(const ParagraphModel *model);
124 
125  // Clear all hypotheses about this line.
126  void SetUnknown() { hypotheses_.truncate(0); }
127 
128  // Append all hypotheses of strong models that match this row as a start.
129  void StartHypotheses(SetOfModels *models) const;
130 
131  // Append all hypotheses of strong models matching this row.
132  void StrongHypotheses(SetOfModels *models) const;
133 
134  // Append all hypotheses for this row.
135  void NonNullHypotheses(SetOfModels *models) const;
136 
137  // Discard any hypotheses whose model is not in the given list.
138  void DiscardNonMatchingHypotheses(const SetOfModels &models);
139 
140  // If we have only one hypothesis and that is that this line is a paragraph
141  // start line of a certain model, return that model. Else return nullptr.
142  const ParagraphModel *UniqueStartHypothesis() const;
143 
144  // If we have only one hypothesis and that is that this line is a paragraph
145  // body line of a certain model, return that model. Else return nullptr.
146  const ParagraphModel *UniqueBodyHypothesis() const;
147 
148  // Return the indentation for the side opposite of the aligned side.
150  switch (just) {
153  default: return lindent_ > rindent_ ? lindent_ : rindent_;
154  }
155  }
156 
157  // Return the indentation for the side the text is aligned to.
159  switch (just) {
162  default: return lindent_ > rindent_ ? lindent_ : rindent_;
163  }
164  }
165 
166  // Append header fields to a vector of row headings.
167  static void AppendDebugHeaderFields(GenericVector<STRING> *header);
168 
169  // Append data for this row to a vector of debug strings.
170  void AppendDebugInfo(const ParagraphTheory &theory,
171  GenericVector<STRING> *dbg) const;
172 
173  const RowInfo *ri_;
174 
175  // These four constants form a horizontal box model for the white space
176  // on the edges of each line. At each point in the algorithm, the following
177  // shall hold:
178  // ri_->pix_ldistance = lmargin_ + lindent_
179  // ri_->pix_rdistance = rindent_ + rmargin_
180  int lmargin_;
181  int lindent_;
182  int rindent_;
183  int rmargin_;
184 
185  private:
186  // Hypotheses of either LT_START or LT_BODY
188 };
189 
190 // A collection of convenience functions for wrapping the set of
191 // Paragraph Models we believe correctly model the paragraphs in the image.
192 class ParagraphTheory {
193  public:
194  // We presume models will outlive us, and that models will take ownership
195  // of any ParagraphModel *'s we add.
197  : models_(models) {}
198  GenericVector<ParagraphModel *> &models() { return *models_; }
199  const GenericVector<ParagraphModel *> &models() const { return *models_; }
200 
201  // Return an existing model if one that is Comparable() can be found.
202  // Else, allocate a new copy of model to save and return a pointer to it.
203  const ParagraphModel *AddModel(const ParagraphModel &model);
204 
205  // Discard any models we've made that are not in the list of used models.
206  void DiscardUnusedModels(const SetOfModels &used_models);
207 
208  // Return the set of all non-centered models.
210 
211  // If any of the non-centered paragraph models we know about fit
212  // rows[start, end), return it. Else nullptr.
214  int start, int end) const;
215 
216  int IndexOf(const ParagraphModel *model) const;
217 
218  private:
220  GenericVectorEqEq<ParagraphModel *> models_we_added_;
221 };
222 
224  int row, const ParagraphModel *model);
226  int row, const ParagraphModel *model);
228  int a, int b, const ParagraphModel *model);
229 
230 // A class for smearing Paragraph Model hypotheses to surrounding rows.
231 // The idea here is that StrongEvidenceClassify first marks only exceedingly
232 // obvious start and body rows and constructs models of them. Thereafter,
233 // we may have left over unmarked lines (mostly end-of-paragraph lines) which
234 // were too short to have much confidence about, but which fit the models we've
235 // constructed perfectly and which we ought to mark. This class is used to
236 // "smear" our models over the text.
237 class ParagraphModelSmearer {
238  public:
240  int row_start, int row_end,
241  ParagraphTheory *theory);
242 
243  // Smear forward paragraph models from existing row markings to subsequent
244  // text lines if they fit, and mark any thereafter still unmodeled rows
245  // with any model in the theory that fits them.
246  void Smear();
247 
248  private:
249  // Record in open_models_ for rows [start_row, end_row) the list of models
250  // currently open at each row.
251  // A model is still open in a row if some previous row has said model as a
252  // start hypothesis, and all rows since (including this row) would fit as
253  // either a body or start line in that model.
254  void CalculateOpenModels(int row_start, int row_end);
255 
256  SetOfModels &OpenModels(int row) {
257  return open_models_[row - row_start_ + 1];
258  }
259 
260  ParagraphTheory *theory_;
262  int row_start_;
263  int row_end_;
264 
265  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
266  //
267  // open_models_: Contains models which there was an active (open) paragraph
268  // as of the previous line and for which the left and right
269  // indents admit the possibility that this text line continues
270  // to fit the same model.
271  // TODO(eger): Think about whether we can get rid of "Open" models and just
272  // use the current hypotheses on RowScratchRegisters.
273  GenericVector<SetOfModels> open_models_;
274 };
275 
276 // Clear all hypotheses about lines [start, end) and reset the margins to the
277 // percentile (0..100) value of the left and right row edges for this run of
278 // rows.
280  GenericVector<RowScratchRegisters> *rows, int start, int end,
281  int percentile);
282 
283 // Return the median inter-word space in rows[row_start, row_end).
285  int row_start, int row_end);
286 
287 // Return whether the first word on the after line can fit in the space at
288 // the end of the before line (knowing which way the text is aligned and read).
289 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
290  const RowScratchRegisters &after,
291  tesseract::ParagraphJustification justification);
292 
293 // Return whether the first word on the after line can fit in the space at
294 // the end of the before line (not knowing the text alignment).
295 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
296  const RowScratchRegisters &after);
297 
298 // Do rows[start, end) form a single instance of the given paragraph model?
300  int start, int end, const ParagraphModel *model);
301 
302 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
303 // normalize each row_owner to point to an actual PARA, and output the
304 // paragraphs in order onto paragraphs.
306  GenericVector<PARA *> *row_owners,
307  PARA_LIST *paragraphs);
308 
309 } // namespace
310 
311 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
tesseract::RowScratchRegisters::lindent_
int lindent_
Definition: paragraphs_internal.h:180
tesseract::RowScratchRegisters::ri_
const RowInfo * ri_
Definition: paragraphs_internal.h:172
tesseract::RowScratchRegisters::OffsideIndent
int OffsideIndent(tesseract::ParagraphJustification just) const
Definition: paragraphs_internal.h:148
tesseract::ParagraphModelSmearer::ParagraphModelSmearer
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
Definition: paragraphs.cpp:1335
tesseract::LineHypothesis::LineHypothesis
LineHypothesis()
Definition: paragraphs_internal.h:75
tesseract::RowScratchRegisters::NonNullHypotheses
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:639
tesseract::UnicodeFor
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:303
tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
Definition: paragraphs.cpp:1646
tesseract::ParagraphTheory::models
GenericVector< ParagraphModel * > & models()
Definition: paragraphs_internal.h:197
tesseract::ValidBodyLine
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1303
tesseract::RowScratchRegisters::AppendDebugInfo
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
Definition: paragraphs.cpp:510
tesseract::ParagraphTheory::models
const GenericVector< ParagraphModel * > & models() const
Definition: paragraphs_internal.h:198
tesseract::JUSTIFICATION_RIGHT
Definition: publictypes.h:252
WERD_CHOICE
Definition: ratngs.h:261
tesseract::LineHypothesis::operator==
bool operator==(const LineHypothesis &other) const
Definition: paragraphs_internal.h:88
tesseract::RowScratchRegisters::StrongHypotheses
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:632
tesseract::LeftWordAttributes
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:423
tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:248
STRING
Definition: strngs.h:45
tesseract::ParagraphTheory::ParagraphTheory
ParagraphTheory(GenericVector< ParagraphModel * > *models)
Definition: paragraphs_internal.h:195
tesseract::RecomputeMarginsAndClearHypotheses
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
Definition: paragraphs.cpp:1583
tesseract::RowScratchRegisters::rmargin_
int rmargin_
Definition: paragraphs_internal.h:182
tesseract::CanonicalizeDetectionResults
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
Definition: paragraphs.cpp:2252
tesseract::RowScratchRegisters::SetBodyLine
void SetBodyLine()
Definition: paragraphs.cpp:601
tesseract::RowScratchRegisters::UniqueBodyHypothesis
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:652
tesseract::RightWordAttributes
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:470
tesseract::ParagraphTheory
Definition: paragraphs_internal.h:191
ParagraphModel
Definition: ocrpara.h:114
tesseract::RowInfo
Definition: paragraphs.h:40
tesseract::RowScratchRegisters::lmargin_
int lmargin_
Definition: paragraphs_internal.h:179
tesseract::RowScratchRegisters::AlignsideIndent
int AlignsideIndent(tesseract::ParagraphJustification just) const
Definition: paragraphs_internal.h:157
tesseract::JUSTIFICATION_LEFT
Definition: publictypes.h:250
tesseract::CrownCompatible
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
Definition: paragraphs.cpp:1314
tesseract::RowScratchRegisters::SetUnknown
void SetUnknown()
Definition: paragraphs_internal.h:125
tesseract::RowScratchRegisters::AddBodyLine
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:618
tesseract::ParagraphTheory::AddModel
const ParagraphModel * AddModel(const ParagraphModel &model)
Definition: paragraphs.cpp:1240
publictypes.h
tesseract::RowScratchRegisters::rindent_
int rindent_
Definition: paragraphs_internal.h:181
tesseract::ParagraphTheory::Fits
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
Definition: paragraphs.cpp:1265
tesseract::RowsFitModel
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
Definition: paragraphs.cpp:1826
UNICHARSET
Definition: unicharset.h:145
tesseract::RowScratchRegisters::Init
void Init(const RowInfo &row)
Definition: paragraphs.cpp:541
tesseract::ParagraphTheory::DiscardUnusedModels
void DiscardUnusedModels(const SetOfModels &used_models)
Definition: paragraphs.cpp:1251
tesseract::LineHypothesis::operator=
LineHypothesis & operator=(const LineHypothesis &other)
Definition: paragraphs_internal.h:82
tesseract::AsciiLikelyListItem
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:296
tesseract
Definition: baseapi.h:65
tesseract::RowScratchRegisters
Definition: paragraphs_internal.h:102
tesseract::kCrownLeft
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:69
GenericVectorEqEq
Definition: genericvector.h:640
tesseract::RowScratchRegisters::AppendDebugHeaderFields
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
Definition: paragraphs.cpp:504
tesseract::LT_UNKNOWN
Definition: paragraphs_internal.h:52
paragraphs.h
tesseract::RowScratchRegisters::DiscardNonMatchingHypotheses
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:659
GenericVector< STRING >
tesseract::LineHypothesis::ty
LineType ty
Definition: paragraphs_internal.h:92
tesseract::ParagraphModelSmearer
Definition: paragraphs_internal.h:236
tesseract::RowScratchRegisters::SetStartLine
void SetStartLine()
Definition: paragraphs.cpp:591
tesseract::LineHypothesis
Definition: paragraphs_internal.h:74
tesseract::LineHypothesis::LineHypothesis
LineHypothesis(LineType line_type, const ParagraphModel *m)
Definition: paragraphs_internal.h:76
tesseract::ParagraphTheory::IndexOf
int IndexOf(const ParagraphModel *model) const
Definition: paragraphs.cpp:1284
tesseract::LT_BODY
Definition: paragraphs_internal.h:51
tesseract::RowScratchRegisters::UniqueStartHypothesis
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:646
tesseract::RowScratchRegisters::GetLineType
LineType GetLineType() const
Definition: paragraphs.cpp:549
tesseract::ParagraphModelSmearer::Smear
void Smear()
Definition: paragraphs.cpp:1382
tesseract::LT_START
Definition: paragraphs_internal.h:50
tesseract::RowScratchRegisters::StartHypotheses
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:625
tesseract::LT_MULTIPLE
Definition: paragraphs_internal.h:53
tesseract::ParagraphTheory::NonCenteredModels
void NonCenteredModels(SetOfModels *models)
Definition: paragraphs.cpp:1276
tesseract::SetOfModels
GenericVectorEqEq< const ParagraphModel * > SetOfModels
Definition: paragraphs_internal.h:98
tesseract::InterwordSpace
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
Definition: paragraphs.cpp:1623
tesseract::RowScratchRegisters::AddStartLine
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:611
tesseract::LineType
LineType
Definition: paragraphs_internal.h:49
tesseract::kCrownRight
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:71
tesseract::StrongModel
bool StrongModel(const ParagraphModel *model)
Definition: paragraphs_internal.h:70
tesseract::ValidFirstLine
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1292
tesseract::LineHypothesis::model
const ParagraphModel * model
Definition: paragraphs_internal.h:93