tesseract
5.0.0-alpha-619-ge9db
paragraphs.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: paragraphs.h
3
* Description: Paragraph Detection data structures.
4
* Author: David Eger
5
* Created: 25 February 2011
6
*
7
* (C) Copyright 2011, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
22
23
#include "
rect.h
"
// for TBOX
24
#include <
tesseract/strngs.h
>
// for STRING
25
26
// Forward declarations.  The declarations in this header only refer to these
// types through pointers, so their full definitions are not needed here.
class PARA_LIST;
class ParagraphModel;

struct PARA;

template <typename T>
class GenericVector;
32
33
namespace
tesseract
{
34
35
// Forward declaration; this header only takes a MutableIterator by pointer.
class MutableIterator;
36
37
// This structure captures all information needed about a text line for the
38
// purposes of paragraph detection. It is meant to be exceedingly light-weight
39
// so that we can easily test paragraph detection independent of the rest of
40
// Tesseract.
41
class
RowInfo
{
42
public
:
43
// Constant data derived from Tesseract output.
44
STRING
text
;
// the full UTF-8 text of the line.
45
bool
ltr
;
// whether the majority of the text is left-to-right
46
// TODO(eger) make this more fine-grained.
47
48
bool
has_leaders
;
// does the line contain leader dots (.....)?
49
bool
has_drop_cap
;
// does the line have a drop cap?
50
int
pix_ldistance
;
// distance to the left pblock boundary in pixels
51
int
pix_rdistance
;
// distance to the right pblock boundary in pixels
52
float
pix_xheight
;
// guessed xheight for the line
53
int
average_interword_space
;
// average space between words in pixels.
54
55
int
num_words
;
56
TBOX
lword_box
;
// in normalized (horiz text rows) space
57
TBOX
rword_box
;
// in normalized (horiz text rows) space
58
59
STRING
lword_text
;
// the UTF-8 text of the leftmost werd
60
STRING
rword_text
;
// the UTF-8 text of the rightmost werd
61
62
// The text of a paragraph typically starts with the start of an idea and
63
// ends with the end of an idea. Here we define paragraph as something that
64
// may have a first line indent and a body indent which may be different.
65
// Typical words that start an idea are:
66
// 1. Words in western scripts that start with
67
// a capital letter, for example "The"
68
// 2. Bulleted or numbered list items, for
69
// example "2."
70
// Typical words which end an idea are words ending in punctuation marks. In
71
// this vocabulary, each list item is represented as a paragraph.
72
bool
lword_indicates_list_item
;
73
bool
lword_likely_starts_idea
;
74
bool
lword_likely_ends_idea
;
75
76
bool
rword_indicates_list_item
;
77
bool
rword_likely_starts_idea
;
78
bool
rword_likely_ends_idea
;
79
};
80
81
// Main entry point for Paragraph Detection Algorithm.
82
//
83
// Given a set of equally spaced textlines (described by row_infos),
84
// Split them into paragraphs. See http://goto/paragraphstalk
85
//
86
// Output:
87
// row_owners - one pointer for each row, to the paragraph it belongs to.
88
// paragraphs - this is the actual list of PARA objects.
89
// models - the list of paragraph models referenced by the PARA objects.
90
// caller is responsible for deleting the models.
91
void
DetectParagraphs
(
int
debug_level,
92
GenericVector<RowInfo>
*row_infos,
93
GenericVector<PARA *>
*row_owners,
94
PARA_LIST *paragraphs,
95
GenericVector<ParagraphModel *>
*models);
96
97
// Given a MutableIterator to the start of a block, run DetectParagraphs on
98
// that block and commit the results to the underlying ROW and BLOCK structs,
99
// saving the ParagraphModels in models. Caller owns the models.
100
// We use unicharset during the function to answer questions such as "is the
101
// first letter of this word upper case?"
102
void
DetectParagraphs
(
int
debug_level,
103
bool
after_text_recognition,
104
const
MutableIterator
*block_start,
105
GenericVector<ParagraphModel *>
*models);
106
107
}
// namespace
108
109
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
tesseract::RowInfo::has_leaders
bool has_leaders
Definition:
paragraphs.h:47
tesseract::RowInfo::rword_box
TBOX rword_box
Definition:
paragraphs.h:56
strngs.h
tesseract::RowInfo::rword_indicates_list_item
bool rword_indicates_list_item
Definition:
paragraphs.h:75
tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition:
paragraphs.h:72
tesseract::RowInfo::pix_rdistance
int pix_rdistance
Definition:
paragraphs.h:50
tesseract::RowInfo::pix_xheight
float pix_xheight
Definition:
paragraphs.h:51
tesseract::RowInfo::lword_indicates_list_item
bool lword_indicates_list_item
Definition:
paragraphs.h:71
STRING
Definition:
strngs.h:45
tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition:
paragraphs.h:73
tesseract::RowInfo::average_interword_space
int average_interword_space
Definition:
paragraphs.h:52
rect.h
tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition:
paragraphs.h:76
tesseract::RowInfo::lword_text
STRING lword_text
Definition:
paragraphs.h:58
ParagraphModel
Definition:
ocrpara.h:114
tesseract::RowInfo
Definition:
paragraphs.h:40
tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition:
paragraphs.h:77
tesseract
Definition:
baseapi.h:65
tesseract::RowInfo::num_words
int num_words
Definition:
paragraphs.h:54
tesseract::RowInfo::has_drop_cap
bool has_drop_cap
Definition:
paragraphs.h:48
GenericVector
Definition:
baseapi.h:40
tesseract::RowInfo::text
STRING text
Definition:
paragraphs.h:43
tesseract::RowInfo::lword_box
TBOX lword_box
Definition:
paragraphs.h:55
tesseract::RowInfo::rword_text
STRING rword_text
Definition:
paragraphs.h:59
tesseract::MutableIterator
Definition:
mutableiterator.h:44
PARA
Definition:
ocrpara.h:29
tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
Definition:
paragraphs.cpp:2284
tesseract::RowInfo::ltr
bool ltr
Definition:
paragraphs.h:44
tesseract::RowInfo::pix_ldistance
int pix_ldistance
Definition:
paragraphs.h:49
TBOX
Definition:
rect.h:33
src
ccmain
paragraphs.h
Generated on Thu Jan 30 2020 14:22:19 for tesseract by
1.8.16