40 cube_cntxt_ = cube_cntxt;
45 if (combiner_net_ !=
NULL) {
56 string net_file_name = data_path + cube_cntxt_->
Lang() +
60 FILE *fp = fopen(net_file_name.c_str(),
"rb");
68 if (combiner_net_ ==
NULL) {
69 tprintf(
"Could not read combiner net file %s", net_file_name.c_str());
71 }
else if (combiner_net_->
out_cnt() != 2) {
72 tprintf(
"Invalid combiner net file %s! Output count != 2\n",
73 net_file_name.c_str());
83 string TesseractCubeCombiner::NormalizeString(
const string &str,
91 for (
int idx = 0; idx < str32.length(); idx++) {
93 if (!remove_punc || iswpunct(str32[idx]) == 0) {
96 if (norm_case && iswalpha(norm_char)) {
97 norm_char = towlower(norm_char);
99 new_str32.push_back(norm_char);
109 int TesseractCubeCombiner::CompareStrings(
const string &str1,
113 if (!ignore_punc && !ignore_case) {
114 return str1.compare(str2);
116 string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
117 string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
118 return norm_str1.compare(norm_str2);
134 vector<double> *features,
138 if (cube_alt_list ==
NULL || cube_alt_list->
AltCount() <= 0)
142 char_32 *cube_best_str32 = cube_alt_list->
Alt(0);
145 string cube_best_str;
146 int cube_best_cost = cube_alt_list->
AltCost(0);
147 int cube_best_bigram_cost = 0;
148 bool cube_best_bigram_cost_valid =
true;
150 cube_best_bigram_cost = cube_cntxt_->
Bigrams()->
153 cube_best_bigram_cost_valid =
false;
161 *agreement = (tess_str.compare(cube_best_str) == 0);
165 string cube_next_best_str;
167 if (cube_alt_list->
AltCount() > 1) {
168 cube_next_best_str32 = cube_alt_list->
Alt(1);
169 if (cube_next_best_str32 ==
NULL ||
173 cube_next_best_cost = cube_alt_list->
AltCost(1);
178 for (tess_rank = 0; tess_rank < cube_alt_list->
AltCount(); tess_rank++) {
181 if (alt_str == tess_str)
187 int tess_cost = cube_obj->
WordCost(tess_str.c_str());
189 int tess_bigram_cost = 0;
190 int tess_bigram_cost_valid =
true;
192 tess_bigram_cost = cube_cntxt_->
Bigrams()->
195 tess_bigram_cost_valid =
false;
198 features->push_back(tess_confidence);
200 features->push_back(tess_cost);
202 features->push_back(tess_rank);
204 features->push_back(tess_str.length());
206 features->push_back(
ValidWord(tess_str));
207 if (tess_bigram_cost_valid) {
209 features->push_back(tess_bigram_cost);
212 features->push_back(cube_best_cost);
214 features->push_back(cube_next_best_cost);
216 features->push_back(cube_best_str.length());
218 features->push_back(
ValidWord(cube_best_str));
219 if (cube_best_bigram_cost_valid) {
221 features->push_back(cube_best_bigram_cost);
224 int compare_nocase_punc = CompareStrings(cube_best_str,
225 tess_str,
false,
true);
226 features->push_back(compare_nocase_punc == 0);
228 int compare_case_nopunc = CompareStrings(cube_best_str,
229 tess_str,
true,
false);
230 features->push_back(compare_case_nopunc == 0);
232 int compare_nocase_nopunc = CompareStrings(cube_best_str,
233 tess_str,
true,
true);
234 features->push_back(compare_nocase_nopunc == 0);
246 if (combiner_net_ ==
NULL || cube_obj ==
NULL) {
247 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): "
248 "Cube objects not initialized; defaulting to Tesseract\n");
255 if (cube_alt_list ==
NULL)
257 if (cube_alt_list ==
NULL || cube_alt_list->
AltCount() <= 0) {
258 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): "
259 "Cube returned no results; defaulting to Tesseract\n");
275 if (combiner_net_ ==
NULL || cube_obj ==
NULL ||
276 cube_alt_list ==
NULL || cube_alt_list->
AltCount() <= 0) {
277 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): "
278 "Cube result cannot be retrieved; defaulting to Tesseract\n");
286 int tess_confidence =
MIN(100,
MAX(1, static_cast<int>(
291 vector<double> features;
294 cube_obj, cube_alt_list,
295 &features, &agreement);
296 if (!combiner_success || agreement)
302 if (!combiner_net_->
FeedForward(&features[0], net_out))
int WordCost(const char *str)
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
tesseract::Tesseract * TesseractObject() const
WERD_CHOICE * best_choice
int AltCost(int alt_idx) const
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
basic_string< char_32 > string_32
bool ValidWord(const string &str)
WordAltList * RecognizeWord(LangModel *lang_mod=NULL)
virtual ~TesseractCubeCombiner()
const STRING & unichar_string() const
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
WordAltList * AlternateList() const
bool FeedForward(const Type *inputs, Type *outputs)
CharBigrams * Bigrams() const
char_32 * Alt(int alt_idx)
static int StrLen(const char_32 *str)
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
TesseractCubeCombiner(CubeRecoContext *cube_cntxt)
CharSet * CharacterSet() const
bool ComputeCombinerFeatures(const string &tess_res, int tess_confidence, CubeObject *cube_obj, WordAltList *cube_alt_list, vector< double > *features, bool *agreement)
const char * string() const
bool GetDataFilePath(string *path) const
const string & Lang() const
static NeuralNet * FromFile(const string file_name)