tesseract  5.0.0-alpha-619-ge9db
params_training_featdef.h
Go to the documentation of this file.
1 // File: params_training_featdef.h
3 // Description: Feature definitions for params training.
4 // Author: Rika Antonova
5 // Created: Mon Nov 28 11:26:42 PDT 2011
6 //
7 // (C) Copyright 2011, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
21 #define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
22 
24 #include <tesseract/strngs.h>
25 
26 namespace tesseract {
27 
28 // Maximum number of unichars in the small and medium sized words.
29 static const int kMaxSmallWordUnichars = 3;  // a small word has at most 3 unichars
30 static const int kMaxMediumWordUnichars = 6;  // a medium word has at most 6 unichars
31 
32 // Raw features extracted from a single OCR hypothesis.
33 // The features are normalized (by outline length or number of unichars as
34 // appropriate) real-valued quantities with unbounded range and
35 // unknown distribution.
36 // Normalization / binarization of these features is done at a later stage.
37 // Note: when adding new fields to this enum, make sure to also update
38 // kParamsTrainingFeatureTypeName, keeping its entries in the same order.
40  // Digits
44  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
48  // Document word (DOC_DAWG_PERM)
52  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
56  // Frequent word (FREQ_DAWG_PERM)
69 
71 };
72 
// Printable name of every feature type, one entry per feature index. The
// trailing "// N" comment on each entry gives its index (0-23, 24 entries).
// The entries are listed in the same order as the kParamsTrainingFeatureType
// enumerators, so any change to that enum must be repeated here.
// NOTE(review): presumably this is the table that
// ParamsTrainingFeatureByName() searches -- confirm in
// params_training_featdef.cpp.
73 static const char * const kParamsTrainingFeatureTypeName[] = {
74  "PTRAIN_DIGITS_SHORT", // 0
75  "PTRAIN_DIGITS_MED", // 1
76  "PTRAIN_DIGITS_LONG", // 2
77  "PTRAIN_NUM_SHORT", // 3
78  "PTRAIN_NUM_MED", // 4
79  "PTRAIN_NUM_LONG", // 5
80  "PTRAIN_DOC_SHORT", // 6
81  "PTRAIN_DOC_MED", // 7
82  "PTRAIN_DOC_LONG", // 8
83  "PTRAIN_DICT_SHORT", // 9
84  "PTRAIN_DICT_MED", // 10
85  "PTRAIN_DICT_LONG", // 11
86  "PTRAIN_FREQ_SHORT", // 12
87  "PTRAIN_FREQ_MED", // 13
88  "PTRAIN_FREQ_LONG", // 14
89  "PTRAIN_SHAPE_COST_PER_CHAR", // 15
90  "PTRAIN_NGRAM_COST_PER_CHAR", // 16
91  "PTRAIN_NUM_BAD_PUNC", // 17
92  "PTRAIN_NUM_BAD_CASE", // 18
93  "PTRAIN_XHEIGHT_CONSISTENCY", // 19
94  "PTRAIN_NUM_BAD_CHAR_TYPE", // 20
95  "PTRAIN_NUM_BAD_SPACING", // 21
96  "PTRAIN_NUM_BAD_FONT", // 22
97  "PTRAIN_RATING_PER_CHAR", // 23
98 };
99 
100 // Returns the index of the feature with the given name, or -1 if the
101 // name is not a known feature. Defined in params_training_featdef.cpp.
102 int ParamsTrainingFeatureByName(const char *name);
103 
104 
105 // Entry with features extracted from a single OCR hypothesis for a word.
108  memset(features, 0, sizeof(features));  // start with every feature slot zero-filled
109  }
111  memcpy(features, other.features, sizeof(features));  // copy the whole feature array
112  str = other.str;
113  cost = other.cost;
114  }
116  memcpy(features, other.features, sizeof(features));  // same member-wise copy as the copy constructor
117  str = other.str;
118  cost = other.cost;
119  return *this;  // lets assignments be chained
120  }
122  STRING str; // string corresponding to word hypothesis (for debugging)
123  float cost; // path cost computed by segsearch
124 };
125 
126 // A list of hypotheses explored during one run of segmentation search.
128 
129 // A bundle that accumulates all of the hypothesis lists explored during all
130 // of the runs of segmentation search on a word (e.g. a list of hypotheses
131 // explored on PASS1, PASS2, fix xheight pass, etc).
133  public:
134  ParamsTrainingBundle() = default;
135  // Starts a new hypothesis list.
136  // Should be called at the beginning of a new run of the segmentation search.
139  }
140  // Appends a copy of `other` to the current hypothesis list (the last list
141  // in hyp_list_vec) and returns a reference to the newly added entry.
143  const ParamsTrainingHypothesis &other) {
144  if (hyp_list_vec.empty()) StartHypothesisList();  // no list yet: start one first
145  hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));
146  return hyp_list_vec.back().back();  // the entry that was just appended
147  }
148 
150 };
151 
152 } // namespace tesseract
153 
154 #endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
strngs.h
tesseract::PTRAIN_FREQ_MED
Definition: params_training_featdef.h:58
tesseract::PTRAIN_RATING_PER_CHAR
Definition: params_training_featdef.h:68
tesseract::PTRAIN_NUM_MED
Definition: params_training_featdef.h:46
tesseract::ParamsTrainingFeatureByName
int ParamsTrainingFeatureByName(const char *name)
Definition: params_training_featdef.cpp:26
tesseract::ParamsTrainingBundle::hyp_list_vec
GenericVector< ParamsTrainingHypothesisList > hyp_list_vec
Definition: params_training_featdef.h:149
tesseract::PTRAIN_DIGITS_SHORT
Definition: params_training_featdef.h:41
tesseract::PTRAIN_NUM_BAD_FONT
Definition: params_training_featdef.h:67
tesseract::ParamsTrainingBundle
Definition: params_training_featdef.h:132
STRING
Definition: strngs.h:45
tesseract::ParamsTrainingHypothesisList
GenericVector< ParamsTrainingHypothesis > ParamsTrainingHypothesisList
Definition: params_training_featdef.h:127
tesseract::PTRAIN_SHAPE_COST_PER_CHAR
Definition: params_training_featdef.h:60
tesseract::PTRAIN_NUM_BAD_CASE
Definition: params_training_featdef.h:63
tesseract::PTRAIN_FREQ_LONG
Definition: params_training_featdef.h:59
tesseract::ParamsTrainingBundle::AddHypothesis
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
Definition: params_training_featdef.h:142
tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition: params_training_featdef.h:70
tesseract::PTRAIN_DICT_SHORT
Definition: params_training_featdef.h:53
genericvector.h
tesseract::PTRAIN_DICT_LONG
Definition: params_training_featdef.h:55
tesseract::PTRAIN_NUM_LONG
Definition: params_training_featdef.h:47
tesseract::PTRAIN_DOC_SHORT
Definition: params_training_featdef.h:49
tesseract::ParamsTrainingBundle::StartHypothesisList
void StartHypothesisList()
Definition: params_training_featdef.h:137
tesseract::ParamsTrainingHypothesis::str
STRING str
Definition: params_training_featdef.h:122
tesseract::ParamsTrainingHypothesis
Definition: params_training_featdef.h:106
tesseract::PTRAIN_XHEIGHT_CONSISTENCY
Definition: params_training_featdef.h:64
tesseract::PTRAIN_DOC_MED
Definition: params_training_featdef.h:50
tesseract::PTRAIN_NUM_BAD_PUNC
Definition: params_training_featdef.h:62
tesseract
Definition: baseapi.h:65
tesseract::PTRAIN_DOC_LONG
Definition: params_training_featdef.h:51
tesseract::ParamsTrainingHypothesis::features
float features[PTRAIN_NUM_FEATURE_TYPES]
Definition: params_training_featdef.h:121
tesseract::ParamsTrainingBundle::ParamsTrainingBundle
ParamsTrainingBundle()=default
tesseract::kParamsTrainingFeatureType
kParamsTrainingFeatureType
Definition: params_training_featdef.h:39
GenericVector
Definition: baseapi.h:40
tesseract::PTRAIN_NGRAM_COST_PER_CHAR
Definition: params_training_featdef.h:61
tesseract::PTRAIN_DIGITS_LONG
Definition: params_training_featdef.h:43
tesseract::ParamsTrainingHypothesis::ParamsTrainingHypothesis
ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other)
Definition: params_training_featdef.h:110
tesseract::PTRAIN_DIGITS_MED
Definition: params_training_featdef.h:42
tesseract::PTRAIN_DICT_MED
Definition: params_training_featdef.h:54
tesseract::PTRAIN_NUM_BAD_SPACING
Definition: params_training_featdef.h:66
tesseract::PTRAIN_NUM_BAD_CHAR_TYPE
Definition: params_training_featdef.h:65
tesseract::ParamsTrainingHypothesis::ParamsTrainingHypothesis
ParamsTrainingHypothesis()
Definition: params_training_featdef.h:107
tesseract::PTRAIN_NUM_SHORT
Definition: params_training_featdef.h:45
tesseract::ParamsTrainingHypothesis::operator=
ParamsTrainingHypothesis & operator=(const ParamsTrainingHypothesis &other)
Definition: params_training_featdef.h:115
tesseract::ParamsTrainingHypothesis::cost
float cost
Definition: params_training_featdef.h:123
tesseract::PTRAIN_FREQ_SHORT
Definition: params_training_featdef.h:57