Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Modules
Pages
paragraphs.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: paragraphs.h
3
* Description: Paragraph Detection data structures.
4
* Author: David Eger
5
* Created: 25 February 2011
6
*
7
* (C) Copyright 2011, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
22
23
#include "
rect.h
"
24
#include "
ocrpara.h
"
25
#include "
genericvector.h
"
26
#include "
strngs.h
"
27
28
29
class
WERD
;
30
class
UNICHARSET
;
31
32
namespace
tesseract
{
33
34
class
MutableIterator;
35
36
// This structure captures all information needed about a text line for the
37
// purposes of paragraph detection. It is meant to be exceedingly light-weight
38
// so that we can easily test paragraph detection independent of the rest of
39
// Tesseract.
40
class
RowInfo
{
41
public
:
42
// Constant data derived from Tesseract output.
43
STRING
text
;
// the full UTF-8 text of the line.
44
bool
ltr
;
// whether the majority of the text is left-to-right
45
// TODO(eger) make this more fine-grained.
46
47
bool
has_leaders
;
// does the line contain leader dots (.....)?
48
bool
has_drop_cap
;
// does the line have a drop cap?
49
int
pix_ldistance
;
// distance to the left pblock boundary in pixels
50
int
pix_rdistance
;
// distance to the right pblock boundary in pixels
51
float
pix_xheight
;
// guessed xheight for the line
52
int
average_interword_space
;
// average space between words in pixels.
53
54
int
num_words
;
55
TBOX
lword_box
;
// in normalized (horiz text rows) space
56
TBOX
rword_box
;
// in normalized (horiz text rows) space
57
58
STRING
lword_text
;
// the UTF-8 text of the leftmost werd
59
STRING
rword_text
;
// the UTF-8 text of the rightmost werd
60
61
// The text of a paragraph typically starts with the start of an idea and
62
// ends with the end of an idea. Here we define paragraph as something that
63
// may have a first line indent and a body indent which may be different.
64
// Typical words that start an idea are:
65
// 1. Words in western scripts that start with
66
// a capital letter, for example "The"
67
// 2. Bulleted or numbered list items, for
68
// example "2."
69
// Typical words which end an idea are words ending in punctuation marks. In
70
// this vocabulary, each list item is represented as a paragraph.
71
bool
lword_indicates_list_item
;
72
bool
lword_likely_starts_idea
;
73
bool
lword_likely_ends_idea
;
74
75
bool
rword_indicates_list_item
;
76
bool
rword_likely_starts_idea
;
77
bool
rword_likely_ends_idea
;
78
};
79
80
// Main entry point for Paragraph Detection Algorithm.
81
//
82
// Given a set of equally spaced textlines (described by row_infos),
83
// Split them into paragraphs. See http://goto/paragraphstalk
84
//
85
// Output:
86
// row_owners - one pointer for each row, to the paragraph it belongs to.
87
// paragraphs - this is the actual list of PARA objects.
88
// models - the list of paragraph models referenced by the PARA objects.
89
// caller is responsible for deleting the models.
90
void
DetectParagraphs
(
int
debug_level,
91
GenericVector<RowInfo>
*row_infos,
92
GenericVector<PARA *>
*row_owners,
93
PARA_LIST *paragraphs,
94
GenericVector<ParagraphModel *>
*models);
95
96
// Given a MutableIterator to the start of a block, run DetectParagraphs on
97
// that block and commit the results to the underlying ROW and BLOCK structs,
98
// saving the ParagraphModels in models. Caller owns the models.
99
// We use unicharset during the function to answer questions such as "is the
100
// first letter of this word upper case?"
101
void
DetectParagraphs
(
int
debug_level,
102
bool
after_text_recognition,
103
const
MutableIterator
*block_start,
104
GenericVector<ParagraphModel *>
*models);
105
106
}
// namespace tesseract
107
108
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
tesseract::RowInfo::rword_indicates_list_item
bool rword_indicates_list_item
Definition:
paragraphs.h:75
tesseract::RowInfo::rword_box
TBOX rword_box
Definition:
paragraphs.h:56
tesseract::RowInfo::lword_likely_ends_idea
bool lword_likely_ends_idea
Definition:
paragraphs.h:73
tesseract::RowInfo::lword_likely_starts_idea
bool lword_likely_starts_idea
Definition:
paragraphs.h:72
tesseract::RowInfo::pix_ldistance
int pix_ldistance
Definition:
paragraphs.h:49
tesseract::RowInfo::pix_rdistance
int pix_rdistance
Definition:
paragraphs.h:50
tesseract::RowInfo::average_interword_space
int average_interword_space
Definition:
paragraphs.h:52
tesseract::RowInfo::rword_likely_ends_idea
bool rword_likely_ends_idea
Definition:
paragraphs.h:77
tesseract::RowInfo::text
STRING text
Definition:
paragraphs.h:43
tesseract::MutableIterator
Definition:
mutableiterator.h:44
tesseract::RowInfo
Definition:
paragraphs.h:40
tesseract::RowInfo::lword_indicates_list_item
bool lword_indicates_list_item
Definition:
paragraphs.h:71
tesseract::RowInfo::has_leaders
bool has_leaders
Definition:
paragraphs.h:47
rect.h
tesseract::RowInfo::lword_text
STRING lword_text
Definition:
paragraphs.h:58
tesseract::RowInfo::ltr
bool ltr
Definition:
paragraphs.h:44
tesseract::RowInfo::pix_xheight
float pix_xheight
Definition:
paragraphs.h:51
WERD
Definition:
werd.h:60
tesseract::RowInfo::rword_likely_starts_idea
bool rword_likely_starts_idea
Definition:
paragraphs.h:76
tesseract::RowInfo::lword_box
TBOX lword_box
Definition:
paragraphs.h:55
tesseract::RowInfo::rword_text
STRING rword_text
Definition:
paragraphs.h:59
tesseract::DetectParagraphs
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
Definition:
paragraphs.cpp:2264
tesseract
Definition:
baseapi.cpp:83
tesseract::RowInfo::has_drop_cap
bool has_drop_cap
Definition:
paragraphs.h:48
TBOX
Definition:
rect.h:30
UNICHARSET
Definition:
unicharset.h:139
strngs.h
STRING
Definition:
strngs.h:44
GenericVector
Definition:
baseapi.h:41
tesseract::RowInfo::num_words
int num_words
Definition:
paragraphs.h:54
ocrpara.h
genericvector.h
ccmain
paragraphs.h
Generated on Mon Jul 20 2015 18:37:46 by
1.8.8