tesseract
5.0.0-alpha-619-ge9db
params_training_featdef.h
Go to the documentation of this file.
1
// File: params_training_featdef.h
3
// Description: Feature definitions for params training.
4
// Author: Rika Antonova
5
// Created: Mon Nov 28 11:26:42 PDT 2011
6
//
7
// (C) Copyright 2011, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
19
20
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
21
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
22
23
#include <
tesseract/genericvector.h
>
24
#include <
tesseract/strngs.h
>
25
26
namespace
tesseract
{
27
28
// Maximum number of unichars in the small and medium sized words
29
// Largest unichar count for which a word still counts as "small".
static constexpr int kMaxSmallWordUnichars = 3;
30
// Largest unichar count for which a word still counts as "medium".
static constexpr int kMaxMediumWordUnichars = 6;
31
32
// Raw features extracted from a single OCR hypothesis.
33
// The features are normalized (by outline length or number of unichars as
34
// appropriate) real-valued quantities with unbounded range and
35
// unknown distribution.
36
// Normalization / binarization of these features is done at a later stage.
37
// Note: when adding new fields to this enum make sure to modify
38
// kParamsTrainingFeatureTypeName
39
// Feature identifiers. The enumerators carry no explicit values, so they are
// numbered consecutively from 0 in declaration order (the trailing comments
// spell the resulting value out). PTRAIN_NUM_FEATURE_TYPES is a count, not a
// feature, and must remain the last enumerator.
enum kParamsTrainingFeatureType {
  // Digits
  PTRAIN_DIGITS_SHORT,         // 0
  PTRAIN_DIGITS_MED,           // 1
  PTRAIN_DIGITS_LONG,          // 2
  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
  PTRAIN_NUM_SHORT,            // 3
  PTRAIN_NUM_MED,              // 4
  PTRAIN_NUM_LONG,             // 5
  // Document word (DOC_DAWG_PERM)
  PTRAIN_DOC_SHORT,            // 6
  PTRAIN_DOC_MED,              // 7
  PTRAIN_DOC_LONG,             // 8
  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
  PTRAIN_DICT_SHORT,           // 9
  PTRAIN_DICT_MED,             // 10
  PTRAIN_DICT_LONG,            // 11
  // Frequent word (FREQ_DAWG_PERM)
  PTRAIN_FREQ_SHORT,           // 12
  PTRAIN_FREQ_MED,             // 13
  PTRAIN_FREQ_LONG,            // 14
  PTRAIN_SHAPE_COST_PER_CHAR,  // 15
  PTRAIN_NGRAM_COST_PER_CHAR,  // 16
  PTRAIN_NUM_BAD_PUNC,         // 17
  PTRAIN_NUM_BAD_CASE,         // 18
  PTRAIN_XHEIGHT_CONSISTENCY,  // 19
  PTRAIN_NUM_BAD_CHAR_TYPE,    // 20
  PTRAIN_NUM_BAD_SPACING,      // 21
  PTRAIN_NUM_BAD_FONT,         // 22
  PTRAIN_RATING_PER_CHAR,      // 23

  PTRAIN_NUM_FEATURE_TYPES
};

// Name of each feature, indexed by its kParamsTrainingFeatureType value.
// Entry i is the spelling of the enumerator whose value is i.
static const char* const kParamsTrainingFeatureTypeName[] = {
  "PTRAIN_DIGITS_SHORT",         // 0
  "PTRAIN_DIGITS_MED",           // 1
  "PTRAIN_DIGITS_LONG",          // 2
  "PTRAIN_NUM_SHORT",            // 3
  "PTRAIN_NUM_MED",              // 4
  "PTRAIN_NUM_LONG",             // 5
  "PTRAIN_DOC_SHORT",            // 6
  "PTRAIN_DOC_MED",              // 7
  "PTRAIN_DOC_LONG",             // 8
  "PTRAIN_DICT_SHORT",           // 9
  "PTRAIN_DICT_MED",             // 10
  "PTRAIN_DICT_LONG",            // 11
  "PTRAIN_FREQ_SHORT",           // 12
  "PTRAIN_FREQ_MED",             // 13
  "PTRAIN_FREQ_LONG",            // 14
  "PTRAIN_SHAPE_COST_PER_CHAR",  // 15
  "PTRAIN_NGRAM_COST_PER_CHAR",  // 16
  "PTRAIN_NUM_BAD_PUNC",         // 17
  "PTRAIN_NUM_BAD_CASE",         // 18
  "PTRAIN_XHEIGHT_CONSISTENCY",  // 19
  "PTRAIN_NUM_BAD_CHAR_TYPE",    // 20
  "PTRAIN_NUM_BAD_SPACING",      // 21
  "PTRAIN_NUM_BAD_FONT",         // 22
  "PTRAIN_RATING_PER_CHAR",      // 23
};

// Compile-time guard: adding or removing an enumerator without updating the
// name table (or vice versa) now fails the build instead of silently
// mislabelling features or reading past the end of the table.
static_assert(static_cast<int>(sizeof(kParamsTrainingFeatureTypeName) /
                               sizeof(kParamsTrainingFeatureTypeName[0])) ==
                  PTRAIN_NUM_FEATURE_TYPES,
              "kParamsTrainingFeatureTypeName needs exactly one entry per "
              "kParamsTrainingFeatureType feature");
99
100
// Returns the index of the given feature (by name),
101
// or -1 meaning the feature is unknown.
102
// Prototype only: this header supplies no body for the lookup; the
// definition is provided out of line.
int ParamsTrainingFeatureByName(const char* name);
103
104
105
// Entry with features extracted from a single OCR hypothesis for a word.
106
struct
ParamsTrainingHypothesis
{
107
ParamsTrainingHypothesis
() :
cost
(0.0) {
108
memset(
features
, 0,
sizeof
(
features
));
109
}
110
ParamsTrainingHypothesis
(
const
ParamsTrainingHypothesis
&other) {
111
memcpy(
features
, other.
features
,
sizeof
(
features
));
112
str
= other.
str
;
113
cost
= other.
cost
;
114
}
115
ParamsTrainingHypothesis
&
operator=
(
const
ParamsTrainingHypothesis
& other) {
116
memcpy(
features
, other.
features
,
sizeof
(
features
));
117
str
= other.
str
;
118
cost
= other.
cost
;
119
return
*
this
;
120
}
121
float
features
[
PTRAIN_NUM_FEATURE_TYPES
];
122
STRING
str
;
// string corresponding to word hypothesis (for debugging)
123
float
cost
;
// path cost computed by segsearch
124
};
125
126
// A list of hypotheses explored during one run of segmentation search.
127
// Container alias: a GenericVector whose elements are
// ParamsTrainingHypothesis objects.
using ParamsTrainingHypothesisList = GenericVector<ParamsTrainingHypothesis>;
128
129
// A bundle that accumulates all of the hypothesis lists explored during all
130
// of the runs of segmentation search on a word (e.g. a list of hypotheses
131
// explored on PASS1, PASS2, fix xheight pass, etc).
132
class
ParamsTrainingBundle
{
133
public
:
134
ParamsTrainingBundle
() =
default
;
135
// Starts a new hypothesis list.
136
// Should be called at the beginning of a new run of the segmentation search.
137
void
StartHypothesisList
() {
138
hyp_list_vec
.push_back(
ParamsTrainingHypothesisList
());
139
}
140
// Adds a new ParamsTrainingHypothesis to the current hypothesis list
141
// and returns the reference to the newly added entry.
142
ParamsTrainingHypothesis
&
AddHypothesis
(
143
const
ParamsTrainingHypothesis
&other) {
144
if
(
hyp_list_vec
.empty())
StartHypothesisList
();
145
hyp_list_vec
.back().push_back(
ParamsTrainingHypothesis
(other));
146
return
hyp_list_vec
.back().back();
147
}
148
149
GenericVector<ParamsTrainingHypothesisList>
hyp_list_vec
;
150
};
151
152
}
// namespace tesseract
153
154
#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
strngs.h
tesseract::PTRAIN_FREQ_MED
Definition:
params_training_featdef.h:58
tesseract::PTRAIN_RATING_PER_CHAR
Definition:
params_training_featdef.h:68
tesseract::PTRAIN_NUM_MED
Definition:
params_training_featdef.h:46
tesseract::ParamsTrainingFeatureByName
int ParamsTrainingFeatureByName(const char *name)
Definition:
params_training_featdef.cpp:26
tesseract::ParamsTrainingBundle::hyp_list_vec
GenericVector< ParamsTrainingHypothesisList > hyp_list_vec
Definition:
params_training_featdef.h:149
tesseract::PTRAIN_DIGITS_SHORT
Definition:
params_training_featdef.h:41
tesseract::PTRAIN_NUM_BAD_FONT
Definition:
params_training_featdef.h:67
tesseract::ParamsTrainingBundle
Definition:
params_training_featdef.h:132
STRING
Definition:
strngs.h:45
tesseract::ParamsTrainingHypothesisList
GenericVector< ParamsTrainingHypothesis > ParamsTrainingHypothesisList
Definition:
params_training_featdef.h:127
tesseract::PTRAIN_SHAPE_COST_PER_CHAR
Definition:
params_training_featdef.h:60
tesseract::PTRAIN_NUM_BAD_CASE
Definition:
params_training_featdef.h:63
tesseract::PTRAIN_FREQ_LONG
Definition:
params_training_featdef.h:59
tesseract::ParamsTrainingBundle::AddHypothesis
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
Definition:
params_training_featdef.h:142
tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition:
params_training_featdef.h:70
tesseract::PTRAIN_DICT_SHORT
Definition:
params_training_featdef.h:53
genericvector.h
tesseract::PTRAIN_DICT_LONG
Definition:
params_training_featdef.h:55
tesseract::PTRAIN_NUM_LONG
Definition:
params_training_featdef.h:47
tesseract::PTRAIN_DOC_SHORT
Definition:
params_training_featdef.h:49
tesseract::ParamsTrainingBundle::StartHypothesisList
void StartHypothesisList()
Definition:
params_training_featdef.h:137
tesseract::ParamsTrainingHypothesis::str
STRING str
Definition:
params_training_featdef.h:122
tesseract::ParamsTrainingHypothesis
Definition:
params_training_featdef.h:106
tesseract::PTRAIN_XHEIGHT_CONSISTENCY
Definition:
params_training_featdef.h:64
tesseract::PTRAIN_DOC_MED
Definition:
params_training_featdef.h:50
tesseract::PTRAIN_NUM_BAD_PUNC
Definition:
params_training_featdef.h:62
tesseract
Definition:
baseapi.h:65
tesseract::PTRAIN_DOC_LONG
Definition:
params_training_featdef.h:51
tesseract::ParamsTrainingHypothesis::features
float features[PTRAIN_NUM_FEATURE_TYPES]
Definition:
params_training_featdef.h:121
tesseract::ParamsTrainingBundle::ParamsTrainingBundle
ParamsTrainingBundle()=default
tesseract::kParamsTrainingFeatureType
kParamsTrainingFeatureType
Definition:
params_training_featdef.h:39
GenericVector
Definition:
baseapi.h:40
tesseract::PTRAIN_NGRAM_COST_PER_CHAR
Definition:
params_training_featdef.h:61
tesseract::PTRAIN_DIGITS_LONG
Definition:
params_training_featdef.h:43
tesseract::ParamsTrainingHypothesis::ParamsTrainingHypothesis
ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other)
Definition:
params_training_featdef.h:110
tesseract::PTRAIN_DIGITS_MED
Definition:
params_training_featdef.h:42
tesseract::PTRAIN_DICT_MED
Definition:
params_training_featdef.h:54
tesseract::PTRAIN_NUM_BAD_SPACING
Definition:
params_training_featdef.h:66
tesseract::PTRAIN_NUM_BAD_CHAR_TYPE
Definition:
params_training_featdef.h:65
tesseract::ParamsTrainingHypothesis::ParamsTrainingHypothesis
ParamsTrainingHypothesis()
Definition:
params_training_featdef.h:107
tesseract::PTRAIN_NUM_SHORT
Definition:
params_training_featdef.h:45
tesseract::ParamsTrainingHypothesis::operator=
ParamsTrainingHypothesis & operator=(const ParamsTrainingHypothesis &other)
Definition:
params_training_featdef.h:115
tesseract::ParamsTrainingHypothesis::cost
float cost
Definition:
params_training_featdef.h:123
tesseract::PTRAIN_FREQ_SHORT
Definition:
params_training_featdef.h:57
src
ccstruct
params_training_featdef.h
Generated on Thu Jan 30 2020 14:22:20 for tesseract by
1.8.16