72 const char* truth_str,
const TBOX& word_box) {
74 truth_has_char_boxes_ =
false;
78 unicharset.
encode_string(truth_str,
false, &encoding, &lengths,
nullptr);
80 for (
int i = 0; i < encoding.
size(); total_length += lengths[i++]) {
81 STRING uch(truth_str + total_length);
92 const char* char_str,
const TBOX& char_box) {
93 STRING symbol_str(char_str);
95 if (
id != INVALID_UNICHAR_ID) {
97 if (normed_uch.length() > 0) symbol_str = normed_uch;
99 int length = truth_word_.
length();
103 truth_has_char_boxes_ =
true;
104 else if (truth_word_.
BlobBox(length - 1) == char_box)
105 truth_has_char_boxes_ =
false;
112 truth_has_char_boxes_ =
false;
117 if (word_choice ==
nullptr)
return false;
120 for (
int i = 0; i < word_choice->
length(); ++i) {
125 return truth_str == normed_choice_str;
131 (*debug) +=
"Truth ";
132 for (
int i = 0; i < this->truth_text_.
length(); ++i) {
133 (*debug) += this->truth_text_[i];
135 if (!this->truth_has_char_boxes_) (*debug) +=
" (no char boxes)";
136 if (choice !=
nullptr) {
137 (*debug) +=
" Choice ";
140 (*debug) += choice_str;
152 norm_box_tolerance_ = kBlamerBoxTolerance * denorm.
x_scale();
157 for (
int b = 0; b < truth_word_.
length(); ++b) {
159 topleft.
x = box.
left();
160 topleft.
y = box.
top();
165 TBOX norm_box(norm_topleft.
x, norm_botright.
y,
166 norm_botright.
x, norm_topleft.
y);
180 int begin2_truth_index = -1;
182 truth_has_char_boxes_) {
183 debug_str =
"Looking for truth split at";
186 debug_str +=
"\nnorm_truth_word boxes:\n";
187 if (norm_truth_word_.
length() > 1) {
189 for (b = 1; b < norm_truth_word_.
length(); ++b) {
191 if ((abs(word1_right - norm_truth_word_.
BlobBox(b - 1).
right()) <
192 norm_box_tolerance_) &&
193 (abs(word2_left - norm_truth_word_.
BlobBox(b).
left()) <
194 norm_box_tolerance_)) {
195 begin2_truth_index = b;
196 debug_str +=
"Split found";
205 if (begin2_truth_index > 0) {
206 bundle1->truth_has_char_boxes_ =
true;
207 bundle1->norm_box_tolerance_ = norm_box_tolerance_;
208 bundle2->truth_has_char_boxes_ =
true;
209 bundle2->norm_box_tolerance_ = norm_box_tolerance_;
211 for (b = 0; b < norm_truth_word_.
length(); ++b) {
212 if (b == begin2_truth_index) curr_bb = bundle2;
215 curr_bb->truth_text_.
push_back(truth_text_[b]);
221 debug_str +=
"Truth split not found";
222 debug_str += truth_has_char_boxes_ ?
223 "\n" :
" (no truth char boxes)\n";
235 if (bundle1.incorrect_result_reason_ !=
IRR_CORRECT &&
238 debug_str +=
"Blame from part 1: ";
239 debug_str += bundle1.debug_;
240 irr = bundle1.incorrect_result_reason_;
242 if (bundle2.incorrect_result_reason_ !=
IRR_CORRECT &&
245 debug_str +=
"Blame from part 2: ";
246 debug_str += bundle2.debug_;
248 irr = bundle2.incorrect_result_reason_;
249 }
else if (irr != bundle2.incorrect_result_reason_) {
253 incorrect_result_reason_ = irr;
255 SetBlame(irr, debug_str,
nullptr,
debug);
263 const TBOX& blob_box,
264 const BLOB_CHOICE_LIST& choices,
266 if (!truth_has_char_boxes_ ||
270 for (
int b = 0; b < norm_truth_word_.
length(); ++b) {
271 const TBOX &truth_box = norm_truth_word_.
BlobBox(b);
277 bool incorrect_adapted =
false;
278 UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
279 const char *truth_str = truth_text_[b].string();
282 BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices));
283 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
284 choices_it.forward()) {
291 incorrect_adapted =
true;
296 STRING debug_str =
"unichar ";
297 debug_str += truth_str;
298 debug_str +=
" not found in classification list";
300 }
else if (incorrect_adapted) {
301 STRING debug_str =
"better rating for adapted ";
303 debug_str +=
" than for correct ";
304 debug_str += truth_str;
316 if (
NoTruth() || !truth_has_char_boxes_ ||
321 bool missing_chop =
false;
325 int16_t truth_x = -1;
326 while (box_index < truth_word_.
length() && blob_index < num_blobs) {
333 truth_x + norm_box_tolerance_) {
340 if (missing_chop || box_index < norm_truth_word_.
length()) {
343 debug_str.
add_str_int(
"Detected missing chop (tolerance=",
344 norm_box_tolerance_);
345 debug_str +=
") at Bounding Box=";
348 debug_str.
add_str_int(
"\nNo chop for truth at x=", truth_x);
351 norm_truth_word_.
length() - box_index);
352 debug_str +=
" truth box(es)";
354 debug_str +=
"\nMaximally chopped word boxes:\n";
355 for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
360 debug_str +=
"Truth bounding boxes:\n";
361 for (box_index = 0; box_index < norm_truth_word_.
length(); ++box_index) {
376 const UNICHARSET& unicharset,
bool valid_permuter,
bool debug) {
377 if (valid_permuter) {
379 best_choice_is_dict_and_top_choice_ =
true;
384 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
385 blob_choice_it.forward()) {
386 if (!(unicharset.
get_fragment(blob_choice_it.data()->unichar_id()))) {
387 first_choice = blob_choice_it.data();
393 best_choice_is_dict_and_top_choice_ =
false;
399 if (best_choice_is_dict_and_top_choice_) {
400 debug_str =
"Best choice is: incorrect, top choice, dictionary word";
401 debug_str +=
" with permuter ";
404 debug_str =
"Classifier/Old LM tradeoff is to blame";
414 if (incorrect_result_reason_ !=
IRR_CORRECT || !truth_has_char_boxes_)
418 debug_str +=
"Blamer computing correct_segmentation_cols\n";
419 int curr_box_col = 0;
420 int next_box_col = 0;
421 int num_blobs =
word->NumBlobs();
422 if (num_blobs == 0)
return;
424 int16_t next_box_x =
word->blobs[blob_index]->bounding_box().right();
425 for (
int truth_idx = 0; blob_index < num_blobs &&
426 truth_idx < norm_truth_word_.
length();
429 int16_t curr_box_x = next_box_x;
430 if (blob_index + 1 < num_blobs)
431 next_box_x =
word->blobs[blob_index + 1]->bounding_box().right();
432 int16_t truth_x = norm_truth_word_.
BlobBox(truth_idx).
right();
433 debug_str.
add_str_int(
"Box x coord vs. truth: ", curr_box_x);
436 if (curr_box_x > (truth_x + norm_box_tolerance_)) {
438 }
else if (curr_box_x >= truth_x - norm_box_tolerance_ &&
439 (blob_index + 1 >= num_blobs ||
440 next_box_x > truth_x + norm_box_tolerance_)) {
441 correct_segmentation_cols_.
push_back(curr_box_col);
442 correct_segmentation_rows_.
push_back(next_box_col-1);
447 curr_box_col = next_box_col;
450 if (blob_index < num_blobs ||
451 correct_segmentation_cols_.
length() != norm_truth_word_.
length()) {
452 debug_str.
add_str_int(
"Blamer failed to find correct segmentation" 453 " (tolerance=", norm_box_tolerance_);
454 if (blob_index >= num_blobs) debug_str +=
" blob == nullptr";
460 correct_segmentation_cols_.
clear();
461 correct_segmentation_rows_.
clear();
468 !segsearch_is_looking_for_blame_ &&
469 truth_has_char_boxes_ &&
480 bool debug,
STRING *debug_str,
482 segsearch_is_looking_for_blame_ =
true;
484 tprintf(
"segsearch starting to look for blame\n");
488 *debug_str +=
"Correct segmentation:\n";
489 for (
int idx = 0; idx < correct_segmentation_cols_.
length(); ++idx) {
490 debug_str->
add_str_int(
"col=", correct_segmentation_cols_[idx]);
491 debug_str->
add_str_int(
" row=", correct_segmentation_rows_[idx]);
493 if (!ratings->
Classified(correct_segmentation_cols_[idx],
494 correct_segmentation_rows_[idx],
496 !cb->
Run(correct_segmentation_cols_[idx],
497 correct_segmentation_rows_[idx])) {
498 segsearch_is_looking_for_blame_ =
false;
499 *debug_str +=
"\nFailed to insert pain point\n";
507 return segsearch_is_looking_for_blame_;
512 bool debug,
STRING *debug_str) {
524 if (segsearch_is_looking_for_blame_) {
525 segsearch_is_looking_for_blame_ =
false;
526 if (best_choice_is_dict_and_top_choice_) {
527 *debug_str =
"Best choice is: incorrect, top choice, dictionary word";
528 *debug_str +=
" with permuter ";
531 }
else if (best_correctly_segmented_rating_ <
533 *debug_str +=
"Correct segmentation state was not explored";
536 if (best_correctly_segmented_rating_ >=
538 *debug_str +=
"Correct segmentation paths were pruned by LM\n";
541 best_correctly_segmented_rating_);
564 STRING debug_str =
"Choice is incorrect after recognition";
583 misadaption_debug_ =
"misadapt to word (";
585 misadaption_debug_ +=
"): ";
bool GuidedSegsearchStillGoing() const
const char kBlameNoTruth[]
const STRING & debug() const
void NormTransform(const DENORM *first_norm, const TPOINT &pt, TPOINT *transformed) const
static const float kBadRating
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
const char kBlameClassifier[]
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
const char * string() const
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
const char kBlameNoTruthSplit[]
const UNICHARSET * unicharset() const
static const char * permuter_name(uint8_t permuter)
void SetChopperBlame(const WERD_RES *word, bool debug)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void StartHypothesisList()
const char kBlameClassLMTradeoff[]
static const char * IncorrectReasonName(IncorrectResultReason irr)
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
const TBOX & BlobBox(int index) const
const char kBlameSegsearchHeur[]
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
const char kBlameClassOldLMTradeoff[]
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
const char kBlameCorrect[]
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
const char kBlamePageLayout[]
void add_str_double(const char *str, double number)
STRING TruthString() const
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
UNICHAR_ID unichar_id(int index) const
const char kBlameChopper[]
DLLSYM void tprintf(const char *format,...)
TBOX bounding_box() const
void truncate_at(int32_t index)
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
bool x_almost_equal(const TBOX &box, int tolerance) const
GenericVector< TBLOB * > blobs
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
void add_str_int(const char *str, int number)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
void SetupCorrectSegmentation(const TWERD *word, bool debug)
bool Classified(int col, int row, int wildcard_id) const
const char kBlameSegsearchPP[]
const char * id_to_unichar(UNICHAR_ID id) const
BlamerBundle * blamer_bundle
static void LastChanceBlame(bool debug, WERD_RES *word)
const char * IncorrectReason() const
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
const char *const kIncorrectResultReasonNames[]
UNICHAR_ID unichar_id() const
const char kBlameAdaption[]
WERD_CHOICE * best_choice
void SetupNormTruthWord(const DENORM &denorm)
void InsertBox(int index, const TBOX &box)
const char kBlameUnknown[]
void print_to_str(STRING *str) const