tesseract  5.0.0-alpha-619-ge9db
fixspace.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: fixspace.cpp (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  * spacing possibilities, trying to use context to improve the
5  * word spacing
6  * Author: Phil Cheatle
7  *
8  * (C) Copyright 1993, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "fixspace.h"
22 #include <cstdint> // for INT16_MAX, int16_t, int32_t
23 #include "blobs.h" // for TWERD, TBLOB, TESSLINE
24 #include "boxword.h" // for BoxWord
25 #include "errcode.h" // for ASSERT_HOST
26 #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
27 #include <tesseract/ocrclass.h> // for ETEXT_DESC
28 #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
29 #include "params.h" // for IntParam, StringParam, BoolParam, Doub...
30 #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
31 #include "rect.h" // for TBOX
32 #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
33 #include <tesseract/strngs.h> // for STRING
34 #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
35 #include "tessvars.h" // for debug_fp
36 #include "tprintf.h" // for tprintf
37 #include <tesseract/unichar.h> // for UNICHAR_ID
38 #include "unicharset.h" // for UNICHARSET
39 #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
40 
41 class BLOCK;
42 class ROW;
43 
44 #define PERFECT_WERDS 999
45 
46 namespace tesseract {
47 
48 /**********************************************************************
49  * c_blob_comparator()
50  *
51  * Blob comparator used to sort a blob list so that blobs are in increasing
52  * order of left edge.
53  **********************************************************************/
54 
55 static int c_blob_comparator( // sort blobs
56  const void *blob1p, // ptr to ptr to blob1
57  const void *blob2p // ptr to ptr to blob2
58  ) {
59  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p);
60  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p);
61 
62  return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
63 }
64 
76  int32_t word_count,
77  PAGE_RES *page_res) {
 // NOTE(review): the line holding the return type, function name and first
 // parameter is absent from this listing. The body dereferences a pointer
 // called `monitor`, so presumably that is the first parameter - confirm
 // against fixspace.h.
 //
 // Walks every block, row and word of page_res. Words that are not followed
 // by a fuzzy space / fuzzy non-space are passed one at a time to
 // fix_sp_fp_word(). A run of words linked by W_FUZZY_NON / W_FUZZY_SP flags
 // is cut out into fuzzy_space_words, handed to fix_fuzzy_space_list(), and
 // spliced back into the row. After each word is stepped over, progress is
 // reported through `monitor` (when non-null) as 90 + 5 * word_index /
 // word_count, and the function returns early if the deadline is exceeded
 // or the cancel callback returns non-zero.
78  BLOCK_RES_IT block_res_it;
79  ROW_RES_IT row_res_it;
80  WERD_RES_IT word_res_it_from;
81  WERD_RES_IT word_res_it_to;
82  WERD_RES *word_res;
83  WERD_RES_LIST fuzzy_space_words;
84  int16_t new_length;
85  bool prevent_null_wd_fixsp; // DON'T process blobless wds
86  int32_t word_index; // current word
87 
88  block_res_it.set_to_list(&page_res->block_res_list);
89  word_index = 0;
90  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91  block_res_it.forward()) {
92  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94  row_res_it.forward()) {
95  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96  while (!word_res_it_from.at_last()) {
97  word_res = word_res_it_from.data();
 // Step over words that do not start a fuzzy run: stop at a combination
 // word or when the NEXT word carries a fuzzy flag.
98  while (!word_res_it_from.at_last() &&
99  !(word_res->combination ||
100  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
101  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
102  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
103  block_res_it.data()->block);
104  word_res = word_res_it_from.forward();
105  word_index++;
106  if (monitor != nullptr) {
107  monitor->ocr_alive = true;
108  monitor->progress = 90 + 5 * word_index / word_count;
109  if (monitor->deadline_exceeded() ||
110  (monitor->cancel != nullptr &&
111  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
112  return;
113  }
114  }
115 
 // Not at the end of the row, so a fuzzy run starts here; word_res_it_to
 // is advanced to the last word of that run.
116  if (!word_res_it_from.at_last()) {
117  word_res_it_to = word_res_it_from;
118  prevent_null_wd_fixsp =
119  word_res->word->cblob_list()->empty();
120  if (check_debug_pt(word_res, 60))
121  debug_fix_space_level.set_value(10);
122  word_res_it_to.forward();
123  word_index++;
124  if (monitor != nullptr) {
125  monitor->ocr_alive = true;
126  monitor->progress = 90 + 5 * word_index / word_count;
127  if (monitor->deadline_exceeded() ||
128  (monitor->cancel != nullptr &&
129  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
130  return;
131  }
132  while (!word_res_it_to.at_last () &&
133  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
134  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
135  if (check_debug_pt(word_res, 60))
136  debug_fix_space_level.set_value(10);
137  if (word_res->word->cblob_list()->empty())
138  prevent_null_wd_fixsp = true;
139  word_res = word_res_it_to.forward();
140  }
141  if (check_debug_pt(word_res, 60))
142  debug_fix_space_level.set_value(10);
143  if (word_res->word->cblob_list()->empty())
144  prevent_null_wd_fixsp = true;
 // A run containing a word with no blobs is skipped, not re-spaced.
145  if (prevent_null_wd_fixsp) {
146  word_res_it_from = word_res_it_to;
147  } else {
 // Detach the run, let fix_fuzzy_space_list() rework it, re-insert the
 // result and step forward by (at most) the length of the new run.
148  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
149  &word_res_it_to);
150  fix_fuzzy_space_list(fuzzy_space_words,
151  row_res_it.data()->row,
152  block_res_it.data()->block);
153  new_length = fuzzy_space_words.length();
154  word_res_it_from.add_list_before(&fuzzy_space_words);
155  for (;
156  !word_res_it_from.at_last() && new_length > 0;
157  new_length--) {
158  word_res_it_from.forward();
159  }
160  }
 // Undo any debug-level bump made for a debug point above.
161  if (test_pt)
162  debug_fix_space_level.set_value(0);
163  }
164  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
165  block_res_it.data()->block);
166  // Last word in row
167  }
168  }
169  }
170 }
171 
172 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
173  ROW *row,
174  BLOCK* block) {
175  int16_t best_score;
176  WERD_RES_LIST current_perm;
177  int16_t current_score;
178  bool improved = false;
179 
180  best_score = eval_word_spacing(best_perm); // default score
181  dump_words(best_perm, best_score, 1, improved);
182 
183  if (best_score != PERFECT_WERDS)
184  initialise_search(best_perm, current_perm);
185 
186  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
187  match_current_words(current_perm, row, block);
188  current_score = eval_word_spacing(current_perm);
189  dump_words(current_perm, current_score, 2, improved);
190  if (current_score > best_score) {
191  best_perm.clear();
192  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
193  best_score = current_score;
194  improved = true;
195  }
196  if (current_score < PERFECT_WERDS)
197  transform_to_next_perm(current_perm);
198  }
199  dump_words(best_perm, best_score, 3, improved);
200 }
201 
202 } // namespace tesseract
203 
204 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
205  WERD_RES_IT src_it(&src_list);
206  WERD_RES_IT new_it(&new_list);
207  WERD_RES *src_wd;
208  WERD_RES *new_wd;
209 
210  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
211  src_wd = src_it.data();
212  if (!src_wd->combination) {
213  new_wd = WERD_RES::deep_copy(src_wd);
214  new_wd->combination = false;
215  new_wd->part_of_combo = false;
216  new_it.add_after_then_move(new_wd);
217  }
218  }
219 }
220 
221 
222 namespace tesseract {
223 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
224  BLOCK* block) {
225  WERD_RES_IT word_it(&words);
226  WERD_RES *word;
227  // Since we are not using PAGE_RES to iterate over words, we need to update
228  // prev_word_best_choice_ before calling classify_word_pass2().
229  prev_word_best_choice_ = nullptr;
230  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231  word = word_it.data();
232  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
233  WordData word_data(block, row, word);
234  SetupWordPassN(2, &word_data);
235  classify_word_and_language(2, nullptr, &word_data);
236  }
238  }
239 }
240 
266 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
267  WERD_RES_IT word_res_it(&word_res_list);
268  int16_t total_score = 0;
269  int16_t word_count = 0;
270  int16_t done_word_count = 0;
271  int16_t word_len;
272  int16_t i;
273  int16_t offset;
274  WERD_RES *word; // current word
275  int16_t prev_word_score = 0;
276  bool prev_word_done = false;
277  bool prev_char_1 = false; // prev ch a "1/I/l"?
278  bool prev_char_digit = false; // prev ch 2..9 or 0
279  bool current_char_1 = false;
280  bool current_word_ok_so_far;
281  STRING punct_chars = "!\"`',.:;";
282  bool prev_char_punct = false;
283  bool current_char_punct = false;
284  bool word_done = false;
285 
286  do {
287  word = word_res_it.data();
288  word_done = fixspace_thinks_word_done(word);
289  word_count++;
290  if (word->tess_failed) {
291  total_score += prev_word_score;
292  if (prev_word_done)
293  done_word_count++;
294  prev_word_score = 0;
295  prev_char_1 = false;
296  prev_char_digit = false;
297  prev_word_done = false;
298  } else {
299  /*
300  Can we add the prev word score and potentially count this word?
301  Yes IF it didn't end in a 1 when the first char of this word is a digit
302  AND it didn't end in a digit when the first char of this word is a 1
303  */
304  word_len = word->reject_map.length();
305  current_word_ok_so_far = false;
306  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
307  (prev_char_digit && (
308  (word_done &&
309  word->best_choice->unichar_lengths().c_str()[0] == 1 &&
310  word->best_choice->unichar_string()[0] == '1') ||
311  (!word_done && STRING(conflict_set_I_l_1).contains(
312  word->best_choice->unichar_string()[0])))))) {
313  total_score += prev_word_score;
314  if (prev_word_done)
315  done_word_count++;
316  current_word_ok_so_far = word_done;
317  }
318 
319  if (current_word_ok_so_far) {
320  prev_word_done = true;
321  prev_word_score = word_len;
322  } else {
323  prev_word_done = false;
324  prev_word_score = 0;
325  }
326 
327  /* Add 1 to total score for every joined 1 regardless of context and
328  rejtn */
329  for (i = 0, prev_char_1 = false; i < word_len; i++) {
330  current_char_1 = word->best_choice->unichar_string()[i] == '1';
331  if (prev_char_1 || (current_char_1 && (i > 0)))
332  total_score++;
333  prev_char_1 = current_char_1;
334  }
335 
336  /* Add 1 to total score for every joined punctuation regardless of context
337  and rejtn */
339  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
340  offset += word->best_choice->unichar_lengths()[i++]) {
341  current_char_punct =
342  punct_chars.contains(word->best_choice->unichar_string()[offset]);
343  if (prev_char_punct || (current_char_punct && i > 0))
344  total_score++;
345  prev_char_punct = current_char_punct;
346  }
347  }
348  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
349  for (i = 0, offset = 0; i < word_len - 1;
350  offset += word->best_choice->unichar_lengths()[i++]);
351  prev_char_1 =
352  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
353  || (!word_done && STRING(conflict_set_I_l_1).contains(
354  word->best_choice->unichar_string()[offset])));
355  }
356  /* Find next word */
357  do {
358  word_res_it.forward();
359  } while (word_res_it.data()->part_of_combo);
360  } while (!word_res_it.at_first());
361  total_score += prev_word_score;
362  if (prev_word_done)
363  done_word_count++;
364  if (done_word_count == word_count)
365  return PERFECT_WERDS;
366  else
367  return total_score;
368 }
369 
370 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
371  int i;
372  int offset;
373 
374  for (i = 0, offset = 0; i < char_position;
375  offset += word->best_choice->unichar_lengths()[i++]);
376  return (
377  word->uch_set->get_isdigit(
378  word->best_choice->unichar_string().c_str() + offset,
379  word->best_choice->unichar_lengths()[i]) ||
380  (word->best_choice->permuter() == NUMBER_PERM &&
382  word->best_choice->unichar_string().c_str()[offset])));
383 }
384 
385 } // namespace tesseract
386 
387 
399 void transform_to_next_perm(WERD_RES_LIST &words) {
400  WERD_RES_IT word_it(&words);
401  WERD_RES_IT prev_word_it(&words);
402  WERD_RES *word;
403  WERD_RES *prev_word;
404  WERD_RES *combo;
405  WERD *copy_word;
406  int16_t prev_right = -INT16_MAX;
407  TBOX box;
408  int16_t gap;
409  int16_t min_gap = INT16_MAX;
410 
411  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
412  word = word_it.data();
413  if (!word->part_of_combo) {
414  box = word->word->bounding_box();
415  if (prev_right > -INT16_MAX) {
416  gap = box.left() - prev_right;
417  if (gap < min_gap)
418  min_gap = gap;
419  }
420  prev_right = box.right();
421  }
422  }
423  if (min_gap < INT16_MAX) {
424  prev_right = -INT16_MAX; // back to start
425  word_it.set_to_list(&words);
426  // Note: we can't use cycle_pt due to inserted combos at start of list.
427  for (; (prev_right == -INT16_MAX) || !word_it.at_first();
428  word_it.forward()) {
429  word = word_it.data();
430  if (!word->part_of_combo) {
431  box = word->word->bounding_box();
432  if (prev_right > -INT16_MAX) {
433  gap = box.left() - prev_right;
434  if (gap <= min_gap) {
435  prev_word = prev_word_it.data();
436  if (prev_word->combination) {
437  combo = prev_word;
438  } else {
439  /* Make a new combination and insert before
440  * the first word being joined. */
441  copy_word = new WERD;
442  *copy_word = *(prev_word->word);
443  // deep copy
444  combo = new WERD_RES(copy_word);
445  combo->combination = true;
446  combo->x_height = prev_word->x_height;
447  prev_word->part_of_combo = true;
448  prev_word_it.add_before_then_move(combo);
449  }
450  combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
451  if (word->combination) {
452  combo->word->join_on(word->word);
453  // Move blobs to combo
454  // old combo no longer needed
455  delete word_it.extract();
456  } else {
457  // Copy current wd to combo
458  combo->copy_on(word);
459  word->part_of_combo = true;
460  }
461  combo->done = false;
462  combo->ClearResults();
463  } else {
464  prev_word_it = word_it; // catch up
465  }
466  }
467  prev_right = box.right();
468  }
469  }
470  } else {
471  words.clear(); // signal termination
472  }
473 }
474 
475 namespace tesseract {
476 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
477  int16_t mode, bool improved) {
478  WERD_RES_IT word_res_it(&perm);
479 
480  if (debug_fix_space_level > 0) {
481  if (mode == 1) {
482  stats_.dump_words_str = "";
483  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484  word_res_it.forward()) {
485  if (!word_res_it.data()->part_of_combo) {
486  stats_.dump_words_str +=
487  word_res_it.data()->best_choice->unichar_string();
488  stats_.dump_words_str += ' ';
489  }
490  }
491  }
492 
493  if (debug_fix_space_level > 1) {
494  switch (mode) {
495  case 1:
496  tprintf("EXTRACTED (%d): \"", score);
497  break;
498  case 2:
499  tprintf("TESTED (%d): \"", score);
500  break;
501  case 3:
502  tprintf("RETURNED (%d): \"", score);
503  break;
504  }
505 
506  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507  word_res_it.forward()) {
508  if (!word_res_it.data()->part_of_combo) {
509  tprintf("%s/%1d ",
510  word_res_it.data()->best_choice->unichar_string().c_str(),
511  static_cast<int>(word_res_it.data()->best_choice->permuter()));
512  }
513  }
514  tprintf("\"\n");
515  } else if (improved) {
516  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
517  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518  word_res_it.forward()) {
519  if (!word_res_it.data()->part_of_combo) {
520  tprintf("%s/%1d ",
521  word_res_it.data()->best_choice->unichar_string().c_str(),
522  static_cast<int>(word_res_it.data()->best_choice->permuter()));
523  }
524  }
525  tprintf("\"\n");
526  }
527  }
528 }
529 
531  if (word->done)
532  return true;
533 
534  /*
535  Use all the standard pass 2 conditions for mode 5 in set_done() in
536  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
537  CARE WHETHER WE HAVE of/at on/an etc.
538  */
539  if (fixsp_done_mode > 0 &&
540  (word->tess_accepted ||
541  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
542  fixsp_done_mode == 3) &&
543  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
544  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
545  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
546  (word->best_choice->permuter() == USER_DAWG_PERM) ||
547  (word->best_choice->permuter() == NUMBER_PERM))) {
548  return true;
549  } else {
550  return false;
551  }
552 }
553 
554 
562 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
563  BLOCK* block) {
564  WERD_RES *word_res;
565  WERD_RES_LIST sub_word_list;
566  WERD_RES_IT sub_word_list_it(&sub_word_list);
567  int16_t blob_index;
568  int16_t new_length;
569  float junk;
570 
571  word_res = word_res_it.data();
572  if (word_res->word->flag(W_REP_CHAR) ||
573  word_res->combination ||
574  word_res->part_of_combo ||
575  !word_res->word->flag(W_DONT_CHOP))
576  return;
577 
578  blob_index = worst_noise_blob(word_res, &junk);
579  if (blob_index < 0)
580  return;
581 
582  if (debug_fix_space_level > 1) {
583  tprintf("FP fixspace working on \"%s\"\n",
584  word_res->best_choice->unichar_string().c_str());
585  }
586  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
587  sub_word_list_it.add_after_stay_put(word_res_it.extract());
588  fix_noisy_space_list(sub_word_list, row, block);
589  new_length = sub_word_list.length();
590  word_res_it.add_list_before(&sub_word_list);
591  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592  word_res_it.forward();
593  }
594 }
595 
596 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
597  BLOCK* block) {
598  int16_t best_score;
599  WERD_RES_IT best_perm_it(&best_perm);
600  WERD_RES_LIST current_perm;
601  WERD_RES_IT current_perm_it(&current_perm);
602  WERD_RES *old_word_res;
603  int16_t current_score;
604  bool improved = false;
605 
606  best_score = fp_eval_word_spacing(best_perm); // default score
607 
608  dump_words(best_perm, best_score, 1, improved);
609 
610  old_word_res = best_perm_it.data();
611  // Even deep_copy doesn't copy the underlying WERD unless its combination
612  // flag is true!.
613  old_word_res->combination = true; // Kludge to force deep copy
614  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
615  old_word_res->combination = false; // Undo kludge
616 
617  break_noisiest_blob_word(current_perm);
618 
619  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
620  match_current_words(current_perm, row, block);
621  current_score = fp_eval_word_spacing(current_perm);
622  dump_words(current_perm, current_score, 2, improved);
623  if (current_score > best_score) {
624  best_perm.clear();
625  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
626  best_score = current_score;
627  improved = true;
628  }
629  if (current_score < PERFECT_WERDS) {
630  break_noisiest_blob_word(current_perm);
631  }
632  }
633  dump_words(best_perm, best_score, 3, improved);
634 }
635 
636 
642 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
643  WERD_RES_IT word_it(&words);
644  WERD_RES_IT worst_word_it;
645  float worst_noise_score = 9999;
646  int worst_blob_index = -1; // Noisiest blob of noisiest wd
647  int blob_index; // of wds noisiest blob
648  float noise_score; // of wds noisiest blob
649  WERD_RES *word_res;
650  C_BLOB_IT blob_it;
651  C_BLOB_IT rej_cblob_it;
652  C_BLOB_LIST new_blob_list;
653  C_BLOB_IT new_blob_it;
654  C_BLOB_IT new_rej_cblob_it;
655  WERD *new_word;
656  int16_t start_of_noise_blob;
657  int16_t i;
658 
659  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
660  blob_index = worst_noise_blob(word_it.data(), &noise_score);
661  if (blob_index > -1 && worst_noise_score > noise_score) {
662  worst_noise_score = noise_score;
663  worst_blob_index = blob_index;
664  worst_word_it = word_it;
665  }
666  }
667  if (worst_blob_index < 0) {
668  words.clear(); // signal termination
669  return;
670  }
671 
672  /* Now split the worst_word_it */
673 
674  word_res = worst_word_it.data();
675 
676  /* Move blobs before noise blob to a new bloblist */
677 
678  new_blob_it.set_to_list(&new_blob_list);
679  blob_it.set_to_list(word_res->word->cblob_list());
680  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681  new_blob_it.add_after_then_move(blob_it.extract());
682  }
683  start_of_noise_blob = blob_it.data()->bounding_box().left();
684  delete blob_it.extract(); // throw out noise blob
685 
686  new_word = new WERD(&new_blob_list, word_res->word);
687  new_word->set_flag(W_EOL, false);
688  word_res->word->set_flag(W_BOL, false);
689  word_res->word->set_blanks(1); // After break
690 
691  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
692  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
693  for (;
694  (!rej_cblob_it.empty() &&
695  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696  rej_cblob_it.forward()) {
697  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
698  }
699 
700  auto* new_word_res = new WERD_RES(new_word);
701  new_word_res->combination = true;
702  worst_word_it.add_before_then_move(new_word_res);
703 
704  word_res->ClearResults();
705 }
706 
707 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
708  float *worst_noise_score) {
709  float noise_score[512];
710  int i;
711  int min_noise_blob; // 1st contender
712  int max_noise_blob; // last contender
713  int non_noise_count;
714  int worst_noise_blob; // Worst blob
715  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
716  float non_noise_limit = kBlnXHeight * 0.8;
717 
718  if (word_res->rebuild_word == nullptr)
719  return -1; // Can't handle cube words.
720 
721  // Normalised.
722  int blob_count = word_res->box_word->length();
723  ASSERT_HOST(blob_count <= 512);
724  if (blob_count < 5)
725  return -1; // too short to split
726 
727  /* Get the noise scores for all blobs */
728 
729  #ifndef SECURE_NAMES
730  if (debug_fix_space_level > 5)
731  tprintf("FP fixspace Noise metrics for \"%s\": ",
732  word_res->best_choice->unichar_string().c_str());
733  #endif
734 
735  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
736  TBLOB* blob = word_res->rebuild_word->blobs[i];
737  if (word_res->reject_map[i].accepted())
738  noise_score[i] = non_noise_limit;
739  else
740  noise_score[i] = blob_noise_score(blob);
741 
742  if (debug_fix_space_level > 5)
743  tprintf("%1.1f ", noise_score[i]);
744  }
745  if (debug_fix_space_level > 5)
746  tprintf("\n");
747 
748  /* Now find the worst one which is far enough away from the end of the word */
749 
750  non_noise_count = 0;
751  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
752  if (noise_score[i] >= non_noise_limit) {
753  non_noise_count++;
754  }
755  }
756  if (non_noise_count < fixsp_non_noise_limit)
757  return -1;
758 
759  min_noise_blob = i;
760 
761  non_noise_count = 0;
762  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
763  i--) {
764  if (noise_score[i] >= non_noise_limit) {
765  non_noise_count++;
766  }
767  }
768  if (non_noise_count < fixsp_non_noise_limit)
769  return -1;
770 
771  max_noise_blob = i;
772 
773  if (min_noise_blob > max_noise_blob)
774  return -1;
775 
776  *worst_noise_score = small_limit;
777  worst_noise_blob = -1;
778  for (i = min_noise_blob; i <= max_noise_blob; i++) {
779  if (noise_score[i] < *worst_noise_score) {
780  worst_noise_blob = i;
781  *worst_noise_score = noise_score[i];
782  }
783  }
784  return worst_noise_blob;
785 }
786 
787 float Tesseract::blob_noise_score(TBLOB *blob) {
788  TBOX box; // BB of outline
789  int16_t outline_count = 0;
790  int16_t max_dimension;
791  int16_t largest_outline_dimension = 0;
792 
793  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
794  outline_count++;
795  box = ol->bounding_box();
796  if (box.height() > box.width()) {
797  max_dimension = box.height();
798  } else {
799  max_dimension = box.width();
800  }
801 
802  if (largest_outline_dimension < max_dimension)
803  largest_outline_dimension = max_dimension;
804  }
805 
806  if (outline_count > 5) {
807  // penalise LOTS of blobs
808  largest_outline_dimension *= 2;
809  }
810 
811  box = blob->bounding_box();
812  if (box.bottom() > kBlnBaselineOffset * 4 ||
813  box.top() < kBlnBaselineOffset / 2) {
814  // Lax blob is if high or low
815  largest_outline_dimension /= 2;
816  }
817 
818  return largest_outline_dimension;
819 }
820 } // namespace tesseract
821 
822 void fixspace_dbg(WERD_RES *word) {
823  TBOX box = word->word->bounding_box();
824  const bool show_map_detail = false;
825  int16_t i;
826 
827  box.print();
828  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
829  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
830  word->word->cblob_list()->length(),
831  word->rebuild_word->NumBlobs(),
832  word->box_word->length());
833  word->reject_map.print(debug_fp);
834  tprintf("\n");
835  if (show_map_detail) {
836  tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
837  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
838  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
839  word->reject_map[i].full_print(debug_fp);
840  }
841  }
842 
843  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
844  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
845 }
846 
847 
856 namespace tesseract {
857 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
858  WERD_RES_IT word_it(&word_res_list);
859  WERD_RES *word;
860  int16_t score = 0;
861  int16_t i;
862  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
863 
864  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865  word = word_it.data();
866  if (word->rebuild_word == nullptr)
867  continue; // Can't handle cube words.
868  if (word->done ||
869  word->tess_accepted ||
870  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
871  word->best_choice->permuter() == FREQ_DAWG_PERM ||
872  word->best_choice->permuter() == USER_DAWG_PERM ||
873  safe_dict_word(word) > 0) {
874  int num_blobs = word->rebuild_word->NumBlobs();
875  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
876  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
877  TBLOB* blob = word->rebuild_word->blobs[i];
878  if (word->best_choice->unichar_id(i) == space ||
879  blob_noise_score(blob) < small_limit) {
880  score -= 1; // penalise possibly erroneous non-space
881  } else if (word->reject_map[i].accepted()) {
882  score++;
883  }
884  }
885  }
886  }
887  if (score < 0)
888  score = 0;
889  return score;
890 }
891 
892 } // namespace tesseract
WERD_RES::done
bool done
Definition: pageres.h:299
REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:332
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::Tesseract::dump_words
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:475
strngs.h
tesseract::Tesseract::break_noisiest_blob_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:641
normalis.h
tesseract::Tesseract::fix_noisy_space_list
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:595
C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:247
tesseract::Tesseract::eval_word_spacing
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:265
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
pageres.h
tessvars.h
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
W_REP_CHAR
repeated character
Definition: werd.h:52
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:51
tesseractclass.h
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:398
fixspace_dbg
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:821
params.h
TBOX::print
void print() const
Definition: rect.h:277
TESSLINE
Definition: blobs.h:201
WERD_RES::combination
bool combination
Definition: pageres.h:333
TBOX::top
int16_t top() const
Definition: rect.h:57
STRING
Definition: strngs.h:45
WERD_RES::x_height
float x_height
Definition: pageres.h:310
transform_to_next_perm
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:398
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476
WERD_RES
Definition: pageres.h:160
rect.h
ETEXT_DESC
Definition: ocrclass.h:95
TESSLINE::next
TESSLINE * next
Definition: blobs.h:279
tesseract::Tesseract::worst_noise_blob
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:706
blobs.h
tesseract::Tesseract::fixsp_small_outlines_size
double fixsp_small_outlines_size
Definition: tesseractclass.h:964
PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:78
tesseract::WordData
Definition: tesseractclass.h:144
C_BLOB
Definition: stepblob.h:36
ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:103
TBOX::height
int16_t height() const
Definition: rect.h:107
ratngs.h
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
werd.h
PERFECT_WERDS
#define PERFECT_WERDS
Definition: fixspace.cpp:43
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
BLOCK
Definition: ocrblock.h:28
tesseract::Tesseract::tessedit_prefer_joined_punct
bool tessedit_prefer_joined_punct
Definition: tesseractclass.h:965
tesseract::Tesseract::debug_fix_space_level
int debug_fix_space_level
Definition: tesseractclass.h:967
tesseract::Tesseract::fp_eval_word_spacing
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:856
WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:643
ETEXT_DESC::cancel_this
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:109
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
ETEXT_DESC::progress
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:98
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:605
tesseract::Tesseract::fix_fuzzy_space_list
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:170
tesseract::Tesseract::fixspace_thinks_word_done
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:529
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1100
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
W_EOL
end of line
Definition: werd.h:47
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
stepblob.h
TBOX::width
int16_t width() const
Definition: rect.h:114
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:129
tesseract::Tesseract::blob_noise_score
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:786
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:228
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract
Definition: baseapi.h:65
tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:134
tesseract::TesseractStats::dump_words_str
STRING dump_words_str
Definition: tesseractclass.h:135
PAGE_RES
Definition: pageres.h:73
tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1318
tprintf.h
fixspace.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:654
STRING::contains
bool contains(char c) const
Definition: strngs.cpp:183
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
ocrclass.h
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
TBLOB
Definition: blobs.h:282
tesseract::Tesseract::numeric_punctuation
char * numeric_punctuation
Definition: tesseractclass.h:968
WERD
Definition: werd.h:55
TBOX::left
int16_t left() const
Definition: rect.h:71
unichar.h
ROW
Definition: ocrrow.h:35
ETEXT_DESC::cancel
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:105
debug_fp
FILE * debug_fp
Definition: tessvars.cpp:23
TBOX::right
int16_t right() const
Definition: rect.h:78
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:73
tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1041
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h
WERD::set_blanks
void set_blanks(uint8_t new_blanks)
Definition: werd.h:101
WERD::join_on
void join_on(WERD *other)
Definition: werd.cpp:198
WERD_RES::word
WERD * word
Definition: pageres.h:180
tesseract::Tesseract::digit_or_numeric_punct
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:369
REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:320
initialise_search
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:203
tesseract::Tesseract::fixsp_non_noise_limit
int fixsp_non_noise_limit
Definition: tesseractclass.h:963
tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:887
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
FREQ_DAWG_PERM
Definition: ratngs.h:242
boxword.h
WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:89
tesseract::Tesseract::match_current_words
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:222
tesseract::Tesseract::fix_sp_fp_word
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:561
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
W_BOL
start of line
Definition: werd.h:46
NUMBER_PERM
Definition: ratngs.h:237
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Tesseract::fixsp_done_mode
int fixsp_done_mode
Definition: tesseractclass.h:966
tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1848
TBOX
Definition: rect.h:33