tesseract  4.0.0-1-g2a2b
fixspace.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: fixspace.cpp (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  * spacing possibilities, trying to use context to improve the
5  * word spacing
6  * Author: Phil Cheatle
7  * Created: Thu Oct 21 11:38:43 BST 1993
8  *
9  * (C) Copyright 1993, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 #include "fixspace.h"
23 #include <cstdint> // for INT16_MAX, int16_t, int32_t
24 #include "blobs.h" // for TWERD, TBLOB, TESSLINE
25 #include "boxword.h" // for BoxWord
26 #include "errcode.h" // for ASSERT_HOST
27 #include "host.h" // for FALSE, TRUE
28 #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
29 #include "ocrclass.h" // for ETEXT_DESC
30 #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
31 #include "params.h" // for IntParam, StringParam, BoolParam, Doub...
32 #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
33 #include "rect.h" // for TBOX
34 #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
35 #include "strngs.h" // for STRING
36 #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
37 #include "tessvars.h" // for debug_fp
38 #include "tprintf.h" // for tprintf
39 #include "unichar.h" // for UNICHAR_ID
40 #include "unicharset.h" // for UNICHARSET
41 #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
42 
43 class BLOCK;
44 class ROW;
45 
46 #define PERFECT_WERDS 999
47 #define MAXSPACING 128 /*max expected spacing in pix */
48 
49 namespace tesseract {
50 
51 /**********************************************************************
52  * c_blob_comparator()
53  *
54  * Blob comparator used to sort a blob list so that blobs are in increasing
55  * order of left edge.
56  **********************************************************************/
57 
58 static int c_blob_comparator( // sort blobs
59  const void *blob1p, // ptr to ptr to blob1
60  const void *blob2p // ptr to ptr to blob2
61  ) {
62  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p);
63  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p);
64 
65  return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
66 }
67 
79  int32_t word_count,
80  PAGE_RES *page_res) {
81  BLOCK_RES_IT block_res_it;
82  ROW_RES_IT row_res_it;
83  WERD_RES_IT word_res_it_from;
84  WERD_RES_IT word_res_it_to;
85  WERD_RES *word_res;
86  WERD_RES_LIST fuzzy_space_words;
87  int16_t new_length;
88  bool prevent_null_wd_fixsp; // DON'T process blobless wds
89  int32_t word_index; // current word
90 
91  block_res_it.set_to_list(&page_res->block_res_list);
92  word_index = 0;
93  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
94  block_res_it.forward()) {
95  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
96  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
97  row_res_it.forward()) {
98  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
99  while (!word_res_it_from.at_last()) {
100  word_res = word_res_it_from.data();
101  while (!word_res_it_from.at_last() &&
102  !(word_res->combination ||
103  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
104  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
105  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
106  block_res_it.data()->block);
107  word_res = word_res_it_from.forward();
108  word_index++;
109  if (monitor != nullptr) {
110  monitor->ocr_alive = TRUE;
111  monitor->progress = 90 + 5 * word_index / word_count;
112  if (monitor->deadline_exceeded() ||
113  (monitor->cancel != nullptr &&
114  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
115  return;
116  }
117  }
118 
119  if (!word_res_it_from.at_last()) {
120  word_res_it_to = word_res_it_from;
121  prevent_null_wd_fixsp =
122  word_res->word->cblob_list()->empty();
123  if (check_debug_pt(word_res, 60))
124  debug_fix_space_level.set_value(10);
125  word_res_it_to.forward();
126  word_index++;
127  if (monitor != nullptr) {
128  monitor->ocr_alive = TRUE;
129  monitor->progress = 90 + 5 * word_index / word_count;
130  if (monitor->deadline_exceeded() ||
131  (monitor->cancel != nullptr &&
132  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
133  return;
134  }
135  while (!word_res_it_to.at_last () &&
136  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
137  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
138  if (check_debug_pt(word_res, 60))
139  debug_fix_space_level.set_value(10);
140  if (word_res->word->cblob_list()->empty())
141  prevent_null_wd_fixsp = true;
142  word_res = word_res_it_to.forward();
143  }
144  if (check_debug_pt(word_res, 60))
145  debug_fix_space_level.set_value(10);
146  if (word_res->word->cblob_list()->empty())
147  prevent_null_wd_fixsp = true;
148  if (prevent_null_wd_fixsp) {
149  word_res_it_from = word_res_it_to;
150  } else {
151  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
152  &word_res_it_to);
153  fix_fuzzy_space_list(fuzzy_space_words,
154  row_res_it.data()->row,
155  block_res_it.data()->block);
156  new_length = fuzzy_space_words.length();
157  word_res_it_from.add_list_before(&fuzzy_space_words);
158  for (;
159  !word_res_it_from.at_last() && new_length > 0;
160  new_length--) {
161  word_res_it_from.forward();
162  }
163  }
164  if (test_pt)
165  debug_fix_space_level.set_value(0);
166  }
167  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
168  block_res_it.data()->block);
169  // Last word in row
170  }
171  }
172  }
173 }
174 
175 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
176  ROW *row,
177  BLOCK* block) {
178  int16_t best_score;
179  WERD_RES_LIST current_perm;
180  int16_t current_score;
181  bool improved = false;
182 
183  best_score = eval_word_spacing(best_perm); // default score
184  dump_words(best_perm, best_score, 1, improved);
185 
186  if (best_score != PERFECT_WERDS)
187  initialise_search(best_perm, current_perm);
188 
189  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
190  match_current_words(current_perm, row, block);
191  current_score = eval_word_spacing(current_perm);
192  dump_words(current_perm, current_score, 2, improved);
193  if (current_score > best_score) {
194  best_perm.clear();
195  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
196  best_score = current_score;
197  improved = true;
198  }
199  if (current_score < PERFECT_WERDS)
200  transform_to_next_perm(current_perm);
201  }
202  dump_words(best_perm, best_score, 3, improved);
203 }
204 
205 } // namespace tesseract
206 
207 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
208  WERD_RES_IT src_it(&src_list);
209  WERD_RES_IT new_it(&new_list);
210  WERD_RES *src_wd;
211  WERD_RES *new_wd;
212 
213  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
214  src_wd = src_it.data();
215  if (!src_wd->combination) {
216  new_wd = WERD_RES::deep_copy(src_wd);
217  new_wd->combination = false;
218  new_wd->part_of_combo = false;
219  new_it.add_after_then_move(new_wd);
220  }
221  }
222 }
223 
224 
225 namespace tesseract {
226 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
227  BLOCK* block) {
228  WERD_RES_IT word_it(&words);
229  WERD_RES *word;
230  // Since we are not using PAGE_RES to iterate over words, we need to update
231  // prev_word_best_choice_ before calling classify_word_pass2().
232  prev_word_best_choice_ = nullptr;
233  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
234  word = word_it.data();
235  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
236  WordData word_data(block, row, word);
237  SetupWordPassN(2, &word_data);
238  classify_word_and_language(2, nullptr, &word_data);
239  }
241  }
242 }
243 
269 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
270  WERD_RES_IT word_res_it(&word_res_list);
271  int16_t total_score = 0;
272  int16_t word_count = 0;
273  int16_t done_word_count = 0;
274  int16_t word_len;
275  int16_t i;
276  int16_t offset;
277  WERD_RES *word; // current word
278  int16_t prev_word_score = 0;
279  bool prev_word_done = false;
280  bool prev_char_1 = false; // prev ch a "1/I/l"?
281  bool prev_char_digit = false; // prev ch 2..9 or 0
282  bool current_char_1 = false;
283  bool current_word_ok_so_far;
284  STRING punct_chars = "!\"`',.:;";
285  bool prev_char_punct = false;
286  bool current_char_punct = false;
287  bool word_done = false;
288 
289  do {
290  word = word_res_it.data();
291  word_done = fixspace_thinks_word_done(word);
292  word_count++;
293  if (word->tess_failed) {
294  total_score += prev_word_score;
295  if (prev_word_done)
296  done_word_count++;
297  prev_word_score = 0;
298  prev_char_1 = false;
299  prev_char_digit = false;
300  prev_word_done = false;
301  } else {
302  /*
303  Can we add the prev word score and potentially count this word?
304  Yes IF it didn't end in a 1 when the first char of this word is a digit
305  AND it didn't end in a digit when the first char of this word is a 1
306  */
307  word_len = word->reject_map.length();
308  current_word_ok_so_far = false;
309  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
310  (prev_char_digit && (
311  (word_done &&
312  word->best_choice->unichar_lengths().string()[0] == 1 &&
313  word->best_choice->unichar_string()[0] == '1') ||
314  (!word_done && STRING(conflict_set_I_l_1).contains(
315  word->best_choice->unichar_string()[0])))))) {
316  total_score += prev_word_score;
317  if (prev_word_done)
318  done_word_count++;
319  current_word_ok_so_far = word_done;
320  }
321 
322  if (current_word_ok_so_far) {
323  prev_word_done = true;
324  prev_word_score = word_len;
325  } else {
326  prev_word_done = false;
327  prev_word_score = 0;
328  }
329 
330  /* Add 1 to total score for every joined 1 regardless of context and
331  rejtn */
332  for (i = 0, prev_char_1 = false; i < word_len; i++) {
333  current_char_1 = word->best_choice->unichar_string()[i] == '1';
334  if (prev_char_1 || (current_char_1 && (i > 0)))
335  total_score++;
336  prev_char_1 = current_char_1;
337  }
338 
339  /* Add 1 to total score for every joined punctuation regardless of context
340  and rejtn */
342  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
343  offset += word->best_choice->unichar_lengths()[i++]) {
344  current_char_punct =
345  punct_chars.contains(word->best_choice->unichar_string()[offset]);
346  if (prev_char_punct || (current_char_punct && i > 0))
347  total_score++;
348  prev_char_punct = current_char_punct;
349  }
350  }
351  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
352  for (i = 0, offset = 0; i < word_len - 1;
353  offset += word->best_choice->unichar_lengths()[i++]);
354  prev_char_1 =
355  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
356  || (!word_done && STRING(conflict_set_I_l_1).contains(
357  word->best_choice->unichar_string()[offset])));
358  }
359  /* Find next word */
360  do {
361  word_res_it.forward();
362  } while (word_res_it.data()->part_of_combo);
363  } while (!word_res_it.at_first());
364  total_score += prev_word_score;
365  if (prev_word_done)
366  done_word_count++;
367  if (done_word_count == word_count)
368  return PERFECT_WERDS;
369  else
370  return total_score;
371 }
372 
373 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
374  int i;
375  int offset;
376 
377  for (i = 0, offset = 0; i < char_position;
378  offset += word->best_choice->unichar_lengths()[i++]);
379  return (
380  word->uch_set->get_isdigit(
381  word->best_choice->unichar_string().string() + offset,
382  word->best_choice->unichar_lengths()[i]) ||
383  (word->best_choice->permuter() == NUMBER_PERM &&
385  word->best_choice->unichar_string().string()[offset])));
386 }
387 
388 } // namespace tesseract
389 
390 
402 void transform_to_next_perm(WERD_RES_LIST &words) {
403  WERD_RES_IT word_it(&words);
404  WERD_RES_IT prev_word_it(&words);
405  WERD_RES *word;
406  WERD_RES *prev_word;
407  WERD_RES *combo;
408  WERD *copy_word;
409  int16_t prev_right = -INT16_MAX;
410  TBOX box;
411  int16_t gap;
412  int16_t min_gap = INT16_MAX;
413 
414  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
415  word = word_it.data();
416  if (!word->part_of_combo) {
417  box = word->word->bounding_box();
418  if (prev_right > -INT16_MAX) {
419  gap = box.left() - prev_right;
420  if (gap < min_gap)
421  min_gap = gap;
422  }
423  prev_right = box.right();
424  }
425  }
426  if (min_gap < INT16_MAX) {
427  prev_right = -INT16_MAX; // back to start
428  word_it.set_to_list(&words);
429  // Note: we can't use cycle_pt due to inserted combos at start of list.
430  for (; (prev_right == -INT16_MAX) || !word_it.at_first();
431  word_it.forward()) {
432  word = word_it.data();
433  if (!word->part_of_combo) {
434  box = word->word->bounding_box();
435  if (prev_right > -INT16_MAX) {
436  gap = box.left() - prev_right;
437  if (gap <= min_gap) {
438  prev_word = prev_word_it.data();
439  if (prev_word->combination) {
440  combo = prev_word;
441  } else {
442  /* Make a new combination and insert before
443  * the first word being joined. */
444  copy_word = new WERD;
445  *copy_word = *(prev_word->word);
446  // deep copy
447  combo = new WERD_RES(copy_word);
448  combo->combination = TRUE;
449  combo->x_height = prev_word->x_height;
450  prev_word->part_of_combo = true;
451  prev_word_it.add_before_then_move(combo);
452  }
453  combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
454  if (word->combination) {
455  combo->word->join_on(word->word);
456  // Move blobs to combo
457  // old combo no longer needed
458  delete word_it.extract();
459  } else {
460  // Copy current wd to combo
461  combo->copy_on(word);
462  word->part_of_combo = true;
463  }
464  combo->done = FALSE;
465  combo->ClearResults();
466  } else {
467  prev_word_it = word_it; // catch up
468  }
469  }
470  prev_right = box.right();
471  }
472  }
473  } else {
474  words.clear(); // signal termination
475  }
476 }
477 
478 namespace tesseract {
479 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
480  int16_t mode, bool improved) {
481  WERD_RES_IT word_res_it(&perm);
482 
483  if (debug_fix_space_level > 0) {
484  if (mode == 1) {
485  stats_.dump_words_str = "";
486  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
487  word_res_it.forward()) {
488  if (!word_res_it.data()->part_of_combo) {
489  stats_.dump_words_str +=
490  word_res_it.data()->best_choice->unichar_string();
491  stats_.dump_words_str += ' ';
492  }
493  }
494  }
495 
496  if (debug_fix_space_level > 1) {
497  switch (mode) {
498  case 1:
499  tprintf("EXTRACTED (%d): \"", score);
500  break;
501  case 2:
502  tprintf("TESTED (%d): \"", score);
503  break;
504  case 3:
505  tprintf("RETURNED (%d): \"", score);
506  break;
507  }
508 
509  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
510  word_res_it.forward()) {
511  if (!word_res_it.data()->part_of_combo) {
512  tprintf("%s/%1d ",
513  word_res_it.data()->best_choice->unichar_string().string(),
514  (int)word_res_it.data()->best_choice->permuter());
515  }
516  }
517  tprintf("\"\n");
518  } else if (improved) {
519  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
520  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
521  word_res_it.forward()) {
522  if (!word_res_it.data()->part_of_combo) {
523  tprintf("%s/%1d ",
524  word_res_it.data()->best_choice->unichar_string().string(),
525  (int)word_res_it.data()->best_choice->permuter());
526  }
527  }
528  tprintf("\"\n");
529  }
530  }
531 }
532 
534  if (word->done)
535  return true;
536 
537  /*
538  Use all the standard pass 2 conditions for mode 5 in set_done() in
539  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
540  CARE WHETHER WE HAVE of/at on/an etc.
541  */
542  if (fixsp_done_mode > 0 &&
543  (word->tess_accepted ||
544  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
545  fixsp_done_mode == 3) &&
546  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr) &&
547  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
548  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
549  (word->best_choice->permuter() == USER_DAWG_PERM) ||
550  (word->best_choice->permuter() == NUMBER_PERM))) {
551  return true;
552  } else {
553  return false;
554  }
555 }
556 
557 
565 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
566  BLOCK* block) {
567  WERD_RES *word_res;
568  WERD_RES_LIST sub_word_list;
569  WERD_RES_IT sub_word_list_it(&sub_word_list);
570  int16_t blob_index;
571  int16_t new_length;
572  float junk;
573 
574  word_res = word_res_it.data();
575  if (word_res->word->flag(W_REP_CHAR) ||
576  word_res->combination ||
577  word_res->part_of_combo ||
578  !word_res->word->flag(W_DONT_CHOP))
579  return;
580 
581  blob_index = worst_noise_blob(word_res, &junk);
582  if (blob_index < 0)
583  return;
584 
585  if (debug_fix_space_level > 1) {
586  tprintf("FP fixspace working on \"%s\"\n",
587  word_res->best_choice->unichar_string().string());
588  }
589  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
590  sub_word_list_it.add_after_stay_put(word_res_it.extract());
591  fix_noisy_space_list(sub_word_list, row, block);
592  new_length = sub_word_list.length();
593  word_res_it.add_list_before(&sub_word_list);
594  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
595  word_res_it.forward();
596  }
597 }
598 
599 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
600  BLOCK* block) {
601  int16_t best_score;
602  WERD_RES_IT best_perm_it(&best_perm);
603  WERD_RES_LIST current_perm;
604  WERD_RES_IT current_perm_it(&current_perm);
605  WERD_RES *old_word_res;
606  int16_t current_score;
607  bool improved = false;
608 
609  best_score = fp_eval_word_spacing(best_perm); // default score
610 
611  dump_words(best_perm, best_score, 1, improved);
612 
613  old_word_res = best_perm_it.data();
614  // Even deep_copy doesn't copy the underlying WERD unless its combination
615  // flag is true!.
616  old_word_res->combination = true; // Kludge to force deep copy
617  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
618  old_word_res->combination = false; // Undo kludge
619 
620  break_noisiest_blob_word(current_perm);
621 
622  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
623  match_current_words(current_perm, row, block);
624  current_score = fp_eval_word_spacing(current_perm);
625  dump_words(current_perm, current_score, 2, improved);
626  if (current_score > best_score) {
627  best_perm.clear();
628  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
629  best_score = current_score;
630  improved = true;
631  }
632  if (current_score < PERFECT_WERDS) {
633  break_noisiest_blob_word(current_perm);
634  }
635  }
636  dump_words(best_perm, best_score, 3, improved);
637 }
638 
639 
645 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
646  WERD_RES_IT word_it(&words);
647  WERD_RES_IT worst_word_it;
648  float worst_noise_score = 9999;
649  int worst_blob_index = -1; // Noisiest blob of noisiest wd
650  int blob_index; // of wds noisiest blob
651  float noise_score; // of wds noisiest blob
652  WERD_RES *word_res;
653  C_BLOB_IT blob_it;
654  C_BLOB_IT rej_cblob_it;
655  C_BLOB_LIST new_blob_list;
656  C_BLOB_IT new_blob_it;
657  C_BLOB_IT new_rej_cblob_it;
658  WERD *new_word;
659  int16_t start_of_noise_blob;
660  int16_t i;
661 
662  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
663  blob_index = worst_noise_blob(word_it.data(), &noise_score);
664  if (blob_index > -1 && worst_noise_score > noise_score) {
665  worst_noise_score = noise_score;
666  worst_blob_index = blob_index;
667  worst_word_it = word_it;
668  }
669  }
670  if (worst_blob_index < 0) {
671  words.clear(); // signal termination
672  return;
673  }
674 
675  /* Now split the worst_word_it */
676 
677  word_res = worst_word_it.data();
678 
679  /* Move blobs before noise blob to a new bloblist */
680 
681  new_blob_it.set_to_list(&new_blob_list);
682  blob_it.set_to_list(word_res->word->cblob_list());
683  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
684  new_blob_it.add_after_then_move(blob_it.extract());
685  }
686  start_of_noise_blob = blob_it.data()->bounding_box().left();
687  delete blob_it.extract(); // throw out noise blob
688 
689  new_word = new WERD(&new_blob_list, word_res->word);
690  new_word->set_flag(W_EOL, FALSE);
691  word_res->word->set_flag(W_BOL, FALSE);
692  word_res->word->set_blanks(1); // After break
693 
694  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
695  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
696  for (;
697  (!rej_cblob_it.empty() &&
698  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
699  rej_cblob_it.forward()) {
700  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
701  }
702 
703  WERD_RES* new_word_res = new WERD_RES(new_word);
704  new_word_res->combination = true;
705  worst_word_it.add_before_then_move(new_word_res);
706 
707  word_res->ClearResults();
708 }
709 
711  float *worst_noise_score) {
712  float noise_score[512];
713  int i;
714  int min_noise_blob; // 1st contender
715  int max_noise_blob; // last contender
716  int non_noise_count;
717  int worst_noise_blob; // Worst blob
718  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
719  float non_noise_limit = kBlnXHeight * 0.8;
720 
721  if (word_res->rebuild_word == nullptr)
722  return -1; // Can't handle cube words.
723 
724  // Normalised.
725  int blob_count = word_res->box_word->length();
726  ASSERT_HOST(blob_count <= 512);
727  if (blob_count < 5)
728  return -1; // too short to split
729 
730  /* Get the noise scores for all blobs */
731 
732  #ifndef SECURE_NAMES
733  if (debug_fix_space_level > 5)
734  tprintf("FP fixspace Noise metrics for \"%s\": ",
735  word_res->best_choice->unichar_string().string());
736  #endif
737 
738  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
739  TBLOB* blob = word_res->rebuild_word->blobs[i];
740  if (word_res->reject_map[i].accepted())
741  noise_score[i] = non_noise_limit;
742  else
743  noise_score[i] = blob_noise_score(blob);
744 
745  if (debug_fix_space_level > 5)
746  tprintf("%1.1f ", noise_score[i]);
747  }
748  if (debug_fix_space_level > 5)
749  tprintf("\n");
750 
751  /* Now find the worst one which is far enough away from the end of the word */
752 
753  non_noise_count = 0;
754  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
755  if (noise_score[i] >= non_noise_limit) {
756  non_noise_count++;
757  }
758  }
759  if (non_noise_count < fixsp_non_noise_limit)
760  return -1;
761 
762  min_noise_blob = i;
763 
764  non_noise_count = 0;
765  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
766  i--) {
767  if (noise_score[i] >= non_noise_limit) {
768  non_noise_count++;
769  }
770  }
771  if (non_noise_count < fixsp_non_noise_limit)
772  return -1;
773 
774  max_noise_blob = i;
775 
776  if (min_noise_blob > max_noise_blob)
777  return -1;
778 
779  *worst_noise_score = small_limit;
780  worst_noise_blob = -1;
781  for (i = min_noise_blob; i <= max_noise_blob; i++) {
782  if (noise_score[i] < *worst_noise_score) {
783  worst_noise_blob = i;
784  *worst_noise_score = noise_score[i];
785  }
786  }
787  return worst_noise_blob;
788 }
789 
791  TBOX box; // BB of outline
792  int16_t outline_count = 0;
793  int16_t max_dimension;
794  int16_t largest_outline_dimension = 0;
795 
796  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
797  outline_count++;
798  box = ol->bounding_box();
799  if (box.height() > box.width()) {
800  max_dimension = box.height();
801  } else {
802  max_dimension = box.width();
803  }
804 
805  if (largest_outline_dimension < max_dimension)
806  largest_outline_dimension = max_dimension;
807  }
808 
809  if (outline_count > 5) {
810  // penalise LOTS of blobs
811  largest_outline_dimension *= 2;
812  }
813 
814  box = blob->bounding_box();
815  if (box.bottom() > kBlnBaselineOffset * 4 ||
816  box.top() < kBlnBaselineOffset / 2) {
817  // Lax blob is if high or low
818  largest_outline_dimension /= 2;
819  }
820 
821  return largest_outline_dimension;
822 }
823 } // namespace tesseract
824 
825 void fixspace_dbg(WERD_RES *word) {
826  TBOX box = word->word->bounding_box();
827  const bool show_map_detail = false;
828  int16_t i;
829 
830  box.print();
831  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
832  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
833  word->word->cblob_list()->length(),
834  word->rebuild_word->NumBlobs(),
835  word->box_word->length());
836  word->reject_map.print(debug_fp);
837  tprintf("\n");
838  if (show_map_detail) {
839  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
840  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
841  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
842  word->reject_map[i].full_print(debug_fp);
843  }
844  }
845 
846  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
847  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
848 }
849 
850 
859 namespace tesseract {
860 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
861  WERD_RES_IT word_it(&word_res_list);
862  WERD_RES *word;
863  int16_t score = 0;
864  int16_t i;
865  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
866 
867  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
868  word = word_it.data();
869  if (word->rebuild_word == nullptr)
870  continue; // Can't handle cube words.
871  if (word->done ||
872  word->tess_accepted ||
873  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
874  word->best_choice->permuter() == FREQ_DAWG_PERM ||
875  word->best_choice->permuter() == USER_DAWG_PERM ||
876  safe_dict_word(word) > 0) {
877  int num_blobs = word->rebuild_word->NumBlobs();
878  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
879  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
880  TBLOB* blob = word->rebuild_word->blobs[i];
881  if (word->best_choice->unichar_id(i) == space ||
882  blob_noise_score(blob) < small_limit) {
883  score -= 1; // penalise possibly erroneous non-space
884  } else if (word->reject_map[i].accepted()) {
885  score++;
886  }
887  }
888  }
889  }
890  if (score < 0)
891  score = 0;
892  return score;
893 }
894 
895 } // namespace tesseract
bool tess_failed
Definition: pageres.h:288
TWERD * rebuild_word
Definition: pageres.h:260
TESSLINE * next
Definition: blobs.h:265
int UNICHAR_ID
Definition: unichar.h:35
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:373
#define TRUE
Definition: capi.h:51
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
FILE * debug_fp
Definition: tessvars.cpp:24
void print() const
Definition: rect.h:278
REJMAP reject_map
Definition: pageres.h:287
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:132
const char * string() const
Definition: strngs.cpp:196
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
void full_print(FILE *fp)
Definition: rejctmap.cpp:335
TBOX bounding_box() const
Definition: werd.cpp:159
uint8_t permuter() const
Definition: ratngs.h:346
Definition: rect.h:34
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:565
int NumBlobs() const
Definition: blobs.h:432
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
const int kBlnXHeight
Definition: normalis.h:24
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
const int kBlnBaselineOffset
Definition: normalis.h:25
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:269
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:479
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:127
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:790
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:825
int16_t width() const
Definition: rect.h:115
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int16_t top() const
Definition: rect.h:58
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
#define PERFECT_WERDS
Definition: fixspace.cpp:46
int16_t reject_count()
Definition: rejctmap.h:229
#define FALSE
Definition: capi.h:52
bool deadline_exceeded() const
Definition: ocrclass.h:164
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:533
bool tess_accepted
Definition: pageres.h:296
void set_blanks(uint8_t new_blanks)
Definition: werd.h:105
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
Definition: werd.h:59
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
Definition: ocrrow.h:36
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:175
Definition: werd.h:34
Definition: ocrblock.h:30
int length() const
Definition: ratngs.h:303
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
bool done
Definition: pageres.h:298
bool combination
Definition: pageres.h:334
float x_height
Definition: pageres.h:311
void ClearResults()
Definition: pageres.cpp:1153
Definition: strngs.h:45
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:207
bool contains(const char c) const
Definition: strngs.cpp:187
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:78
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710
TBOX bounding_box() const
Definition: stepblob.cpp:255
const UNICHARSET * uch_set
Definition: pageres.h:206
int length() const
Definition: boxword.h:83
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:226
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:129
const STRING & unichar_string() const
Definition: ratngs.h:541
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:599
bool part_of_combo
Definition: pageres.h:335
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
Definition: blobs.h:268
void join_on(WERD *other)
Definition: werd.cpp:210
int16_t bottom() const
Definition: rect.h:65
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:402
TESSLINE * outlines
Definition: blobs.h:384
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:122
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108
tesseract::BoxWord * box_word
Definition: pageres.h:266
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:645
void print(FILE *fp)
Definition: rejctmap.cpp:323
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:860
#define ASSERT_HOST(x)
Definition: errcode.h:84
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338
WERD * word
Definition: pageres.h:189