tesseract  4.0.0-1-g2a2b
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  * Created: Mon May 9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <cctype>
21 #include "docqual.h"
22 #include "reject.h"
23 #include "tesscallback.h"
24 #include "tessvars.h"
25 #include "globals.h"
26 #include "tesseractclass.h"
27 
28 namespace tesseract{
29 
30 // A little class to provide the callbacks as we have no pre-bound args.
32  explicit DocQualCallbacks(WERD_RES* word0)
33  : word(word0), match_count(0), accepted_match_count(0) {}
34 
35  void CountMatchingBlobs(int index) {
36  ++match_count;
37  }
38 
39  void CountAcceptedBlobs(int index) {
40  if (word->reject_map[index].accepted())
42  ++match_count;
43  }
44 
45  void AcceptIfGoodQuality(int index) {
46  if (word->reject_map[index].accept_if_good_quality())
47  word->reject_map[index].setrej_quality_accept();
48  }
49 
51  int16_t match_count;
53 };
54 
55 /*************************************************************************
56  * word_blob_quality()
57  * How many blobs in the box_word are identical to those of the inword?
58  * ASSUME blobs in both initial word and box_word are in ascending order of
59  * left hand blob edge.
60  *************************************************************************/
62  if (word->bln_boxes == nullptr ||
63  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
64  return 0;
65 
66  DocQualCallbacks cb(word);
68  *word->rebuild_word,
70  return cb.match_count;
71 }
72 
74  int16_t i = 0;
75  int16_t err_count = 0;
76 
77  if (word->rebuild_word != nullptr) {
78  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
79  TBLOB* blob = word->rebuild_word->blobs[b];
80  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
81  blob->NumOutlines());
82  i++;
83  }
84  }
85  return err_count;
86 }
87 
88 /*************************************************************************
89  * word_char_quality()
90  * Combination of blob quality and outline quality - how many good chars are
91  * there? - I.e chars which pass the blob AND outline tests.
92  *************************************************************************/
94  ROW *row,
95  int16_t *match_count,
96  int16_t *accepted_match_count) {
97  if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
98  word->rebuild_word->blobs.empty()) {
99  *match_count = 0;
100  *accepted_match_count = 0;
101  return;
102  }
103 
104  DocQualCallbacks cb(word);
106  *word->rebuild_word,
108  *match_count = cb.match_count;
109  *accepted_match_count = cb.accepted_match_count;
110 }
111 
112 /*************************************************************************
113  * unrej_good_chs()
114  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
115  *************************************************************************/
117  if (word->bln_boxes == nullptr ||
118  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
119  return;
120 
121  DocQualCallbacks cb(word);
123  *word->rebuild_word,
125 }
126 
127 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
128  int expected_outline_count;
129 
130  if (STRING (outlines_odd).contains (c))
131  return 0; // Don't use this char
132  else if (STRING (outlines_2).contains (c))
133  expected_outline_count = 2;
134  else
135  expected_outline_count = 1;
136  return abs (outline_count - expected_outline_count);
137 }
138 
140  bool good_quality_doc) {
141  if ((tessedit_good_quality_unrej && good_quality_doc))
142  unrej_good_quality_words(page_res_it);
143  doc_and_block_rejection(page_res_it, good_quality_doc);
144  if (unlv_tilde_crunching) {
145  tilde_crunch(page_res_it);
146  tilde_delete(page_res_it);
147  }
148 }
149 
150 /*************************************************************************
151  * unrej_good_quality_words()
152  * Accept potential rejects in words which pass the following checks:
153  * - Contains a potential reject
154  * - Word looks like a sensible alpha word.
155  * - Word segmentation is the same as the original image
156  * - All characters have the expected number of outlines
157  * NOTE - the rejection counts are recalculated after unrejection
158  * - CAN'T do it in a single pass without a bit of fiddling
159  * - keep it simple but inefficient
160  *************************************************************************/
161 void Tesseract::unrej_good_quality_words( //unreject potential
162  PAGE_RES_IT &page_res_it) {
163  WERD_RES *word;
164  ROW_RES *current_row;
165  BLOCK_RES *current_block;
166  int i;
167 
168  page_res_it.restart_page ();
169  while (page_res_it.word () != nullptr) {
170  check_debug_pt (page_res_it.word (), 100);
171  if (bland_unrej) {
172  word = page_res_it.word ();
173  for (i = 0; i < word->reject_map.length (); i++) {
174  if (word->reject_map[i].accept_if_good_quality ())
175  word->reject_map[i].setrej_quality_accept ();
176  }
177  page_res_it.forward ();
178  }
179  else if ((page_res_it.row ()->char_count > 0) &&
180  ((page_res_it.row ()->rej_count /
181  (float) page_res_it.row ()->char_count) <=
183  word = page_res_it.word ();
187  word->best_choice->unichar_string().string(),
189  != AC_UNACCEPTABLE)) {
190  unrej_good_chs(word, page_res_it.row ()->row);
191  }
192  page_res_it.forward ();
193  }
194  else {
195  /* Skip to end of dodgy row */
196  current_row = page_res_it.row ();
197  while ((page_res_it.word () != nullptr) &&
198  (page_res_it.row () == current_row))
199  page_res_it.forward ();
200  }
201  check_debug_pt (page_res_it.word (), 110);
202  }
203  page_res_it.restart_page ();
204  page_res_it.page_res->char_count = 0;
205  page_res_it.page_res->rej_count = 0;
206  current_block = nullptr;
207  current_row = nullptr;
208  while (page_res_it.word () != nullptr) {
209  if (current_block != page_res_it.block ()) {
210  current_block = page_res_it.block ();
211  current_block->char_count = 0;
212  current_block->rej_count = 0;
213  }
214  if (current_row != page_res_it.row ()) {
215  current_row = page_res_it.row ();
216  current_row->char_count = 0;
217  current_row->rej_count = 0;
218  current_row->whole_word_rej_count = 0;
219  }
220  page_res_it.rej_stat_word ();
221  page_res_it.forward ();
222  }
223 }
224 
225 
226 /*************************************************************************
227  * doc_and_block_rejection()
228  *
229  * If the page has too many rejects - reject all of it.
230  * If any block has too many rejects - reject all words in the block
231  *************************************************************************/
232 
233 void Tesseract::doc_and_block_rejection( //reject big chunks
234  PAGE_RES_IT &page_res_it,
235  bool good_quality_doc) {
236  int16_t block_no = 0;
237  int16_t row_no = 0;
238  BLOCK_RES *current_block;
239  ROW_RES *current_row;
240 
241  bool rej_word;
242  bool prev_word_rejected;
243  int16_t char_quality = 0;
244  int16_t accepted_char_quality;
245 
246  if (page_res_it.page_res->rej_count * 100.0 /
248  reject_whole_page(page_res_it);
250  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
251  page_res_it.page_res->char_count,
252  page_res_it.page_res->rej_count);
253  }
254  } else {
256  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
257  page_res_it.page_res->char_count,
258  page_res_it.page_res->rej_count);
259  }
260 
261  /* Walk blocks testing for block rejection */
262 
263  page_res_it.restart_page();
264  WERD_RES* word;
265  while ((word = page_res_it.word()) != nullptr) {
266  current_block = page_res_it.block();
267  block_no = current_block->block->pdblk.index();
268  if (current_block->char_count > 0 &&
269  (current_block->rej_count * 100.0 / current_block->char_count) >
272  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
273  block_no, current_block->char_count,
274  current_block->rej_count);
275  }
276  prev_word_rejected = false;
277  while ((word = page_res_it.word()) != nullptr &&
278  (page_res_it.block() == current_block)) {
280  rej_word = word->reject_map.reject_count() > 0 ||
282  if (rej_word && tessedit_dont_blkrej_good_wds &&
285  *word->uch_set,
286  word->best_choice->unichar_string().string(),
287  word->best_choice->unichar_lengths().string()) !=
288  AC_UNACCEPTABLE) {
289  word_char_quality(word, page_res_it.row()->row,
290  &char_quality,
291  &accepted_char_quality);
292  rej_word = char_quality != word->reject_map.length();
293  }
294  } else {
295  rej_word = true;
296  }
297  if (rej_word) {
298  /*
299  Reject spacing if both current and prev words are rejected.
300  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
301  generated more space errors.
302  */
304  prev_word_rejected &&
305  page_res_it.prev_row() == page_res_it.row() &&
306  word->word->space() == 1)
307  word->reject_spaces = true;
309  }
310  prev_word_rejected = rej_word;
311  page_res_it.forward();
312  }
313  } else {
315  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
316  block_no, page_res_it.block()->char_count,
317  page_res_it.block()->rej_count);
318  }
319 
320  /* Walk rows in block testing for row rejection */
321  row_no = 0;
322  while (page_res_it.word() != nullptr &&
323  page_res_it.block() == current_block) {
324  current_row = page_res_it.row();
325  row_no++;
326  /* Reject whole row if:
327  fraction of chars on row which are rejected exceed a limit AND
328  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
329  limit
330  */
331  if (current_row->char_count > 0 &&
332  (current_row->rej_count * 100.0 / current_row->char_count) >
334  (current_row->whole_word_rej_count * 100.0 /
335  current_row->rej_count) <
338  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
339  row_no, current_row->char_count,
340  current_row->rej_count);
341  }
342  prev_word_rejected = false;
343  while ((word = page_res_it.word()) != nullptr &&
344  page_res_it.row () == current_row) {
345  /* Preserve words on good docs unless they are mostly rejected*/
346  if (!tessedit_row_rej_good_docs && good_quality_doc) {
347  rej_word = word->reject_map.reject_count() /
348  static_cast<float>(word->reject_map.length()) >
351  /* Preserve perfect words anyway */
352  rej_word = word->reject_map.reject_count() > 0 ||
354  if (rej_word && tessedit_dont_rowrej_good_wds &&
357  word->best_choice->unichar_string().string(),
358  word->best_choice->unichar_lengths().string()) !=
359  AC_UNACCEPTABLE) {
360  word_char_quality(word, page_res_it.row()->row,
361  &char_quality,
362  &accepted_char_quality);
363  rej_word = char_quality != word->reject_map.length();
364  }
365  } else {
366  rej_word = true;
367  }
368  if (rej_word) {
369  /*
370  Reject spacing if both current and prev words are rejected.
371  NOTE - this is NOT restricted to FUZZY spaces. - When tried
372  this generated more space errors.
373  */
375  prev_word_rejected &&
376  page_res_it.prev_row() == page_res_it.row() &&
377  word->word->space () == 1)
378  word->reject_spaces = true;
380  }
381  prev_word_rejected = rej_word;
382  page_res_it.forward();
383  }
384  } else {
386  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
387  row_no, current_row->char_count, current_row->rej_count);
388  }
389  while (page_res_it.word() != nullptr &&
390  page_res_it.row() == current_row)
391  page_res_it.forward();
392  }
393  }
394  }
395  }
396  }
397 }
398 
399 } // namespace tesseract
400 
401 /*************************************************************************
402  * reject_whole_page()
403  * Don't believe any of it - set the reject map to 00..00 in all words
404  *
405  *************************************************************************/
406 
407 void reject_whole_page(PAGE_RES_IT &page_res_it) {
408  page_res_it.restart_page ();
409  while (page_res_it.word () != nullptr) {
410  page_res_it.word ()->reject_map.rej_word_doc_rej ();
411  page_res_it.forward ();
412  }
413  //whole page is rejected
414  page_res_it.page_res->rejected = true;
415 }
416 
417 namespace tesseract {
419  WERD_RES *word;
420  GARBAGE_LEVEL garbage_level;
421  PAGE_RES_IT copy_it;
422  bool prev_potential_marked = false;
423  bool found_terrible_word = false;
424  BOOL8 ok_dict_word;
425 
426  page_res_it.restart_page();
427  while (page_res_it.word() != nullptr) {
428  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
429  if (pb != nullptr && !pb->IsText()) {
430  page_res_it.forward();
431  continue;
432  }
433  word = page_res_it.word();
434 
436  convert_bad_unlv_chs(word);
437 
439  word->merge_tess_fails();
440 
441  if (word->reject_map.accept_count () != 0) {
442  found_terrible_word = false;
443  //Forget earlier potential crunches
444  prev_potential_marked = false;
445  }
446  else {
447  ok_dict_word = safe_dict_word(word);
448  garbage_level = garbage_word(word, ok_dict_word);
449 
450  if ((garbage_level != G_NEVER_CRUNCH) &&
451  (terrible_word_crunch (word, garbage_level))) {
452  if (crunch_debug > 0) {
453  tprintf ("T CRUNCHING: \"%s\"\n",
454  word->best_choice->unichar_string().string());
455  }
457  if (prev_potential_marked) {
458  while (copy_it.word () != word) {
459  if (crunch_debug > 0) {
460  tprintf ("P1 CRUNCHING: \"%s\"\n",
461  copy_it.word()->best_choice->unichar_string().string());
462  }
463  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
464  copy_it.forward ();
465  }
466  prev_potential_marked = false;
467  }
468  found_terrible_word = true;
469  }
470  else if ((garbage_level != G_NEVER_CRUNCH) &&
471  (potential_word_crunch (word,
472  garbage_level, ok_dict_word))) {
473  if (found_terrible_word) {
474  if (crunch_debug > 0) {
475  tprintf ("P2 CRUNCHING: \"%s\"\n",
476  word->best_choice->unichar_string().string());
477  }
479  }
480  else if (!prev_potential_marked) {
481  copy_it = page_res_it;
482  prev_potential_marked = true;
483  if (crunch_debug > 1) {
484  tprintf ("P3 CRUNCHING: \"%s\"\n",
485  word->best_choice->unichar_string().string());
486  }
487  }
488  }
489  else {
490  found_terrible_word = false;
491  //Forget earlier potential crunches
492  prev_potential_marked = false;
493  if (crunch_debug > 2) {
494  tprintf ("NO CRUNCH: \"%s\"\n",
495  word->best_choice->unichar_string().string());
496  }
497  }
498  }
499  page_res_it.forward ();
500  }
501 }
502 
503 
505  GARBAGE_LEVEL garbage_level) {
506  float rating_per_ch;
507  int adjusted_len;
508  int crunch_mode = 0;
509 
510  if ((word->best_choice->unichar_string().length() == 0) ||
511  (strspn(word->best_choice->unichar_string().string(), " ") ==
513  crunch_mode = 1;
514  else {
515  adjusted_len = word->reject_map.length ();
516  if (adjusted_len > crunch_rating_max)
517  adjusted_len = crunch_rating_max;
518  rating_per_ch = word->best_choice->rating () / adjusted_len;
519 
520  if (rating_per_ch > crunch_terrible_rating)
521  crunch_mode = 2;
522  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
523  crunch_mode = 3;
524  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
525  (garbage_level != G_OK))
526  crunch_mode = 4;
527  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
528  (garbage_level != G_OK))
529  crunch_mode = 5;
530  }
531  if (crunch_mode > 0) {
532  if (crunch_debug > 2) {
533  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
534  crunch_mode, word->best_choice->unichar_string().string());
535  }
536  return true;
537  }
538  else
539  return false;
540 }
541 
543  GARBAGE_LEVEL garbage_level,
544  bool ok_dict_word) {
545  float rating_per_ch;
546  int adjusted_len;
547  const char *str = word->best_choice->unichar_string().string();
548  const char *lengths = word->best_choice->unichar_lengths().string();
549  bool word_crunchable;
550  int poor_indicator_count = 0;
551 
552  word_crunchable = !crunch_leave_accept_strings ||
553  word->reject_map.length() < 3 ||
555  str, lengths) == AC_UNACCEPTABLE &&
556  !ok_dict_word);
557 
558  adjusted_len = word->reject_map.length();
559  if (adjusted_len > 10)
560  adjusted_len = 10;
561  rating_per_ch = word->best_choice->rating() / adjusted_len;
562 
563  if (rating_per_ch > crunch_pot_poor_rate) {
564  if (crunch_debug > 2) {
565  tprintf("Potential poor rating on \"%s\"\n",
566  word->best_choice->unichar_string().string());
567  }
568  poor_indicator_count++;
569  }
570 
571  if (word_crunchable &&
573  if (crunch_debug > 2) {
574  tprintf("Potential poor cert on \"%s\"\n",
575  word->best_choice->unichar_string().string());
576  }
577  poor_indicator_count++;
578  }
579 
580  if (garbage_level != G_OK) {
581  if (crunch_debug > 2) {
582  tprintf("Potential garbage on \"%s\"\n",
583  word->best_choice->unichar_string().string());
584  }
585  poor_indicator_count++;
586  }
587  return poor_indicator_count >= crunch_pot_indicators;
588 }
589 
591  WERD_RES *word;
592  PAGE_RES_IT copy_it;
593  bool deleting_from_bol = false;
594  bool marked_delete_point = false;
595  int16_t debug_delete_mode;
596  CRUNCH_MODE delete_mode;
597  int16_t x_debug_delete_mode;
598  CRUNCH_MODE x_delete_mode;
599 
600  page_res_it.restart_page();
601  while (page_res_it.word() != nullptr) {
602  word = page_res_it.word();
603 
604  delete_mode = word_deletable (word, debug_delete_mode);
605  if (delete_mode != CR_NONE) {
606  if (word->word->flag (W_BOL) || deleting_from_bol) {
607  if (crunch_debug > 0) {
608  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
609  debug_delete_mode,
610  word->best_choice->unichar_string().string());
611  }
612  word->unlv_crunch_mode = delete_mode;
613  deleting_from_bol = true;
614  } else if (word->word->flag(W_EOL)) {
615  if (marked_delete_point) {
616  while (copy_it.word() != word) {
617  x_delete_mode = word_deletable (copy_it.word (),
618  x_debug_delete_mode);
619  if (crunch_debug > 0) {
620  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
621  x_debug_delete_mode,
622  copy_it.word()->best_choice->unichar_string().string());
623  }
624  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
625  copy_it.forward ();
626  }
627  }
628  if (crunch_debug > 0) {
629  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
630  debug_delete_mode,
631  word->best_choice->unichar_string().string());
632  }
633  word->unlv_crunch_mode = delete_mode;
634  deleting_from_bol = false;
635  marked_delete_point = false;
636  }
637  else {
638  if (!marked_delete_point) {
639  copy_it = page_res_it;
640  marked_delete_point = true;
641  }
642  }
643  }
644  else {
645  deleting_from_bol = false;
646  //Forget earlier potential crunches
647  marked_delete_point = false;
648  }
649  /*
650  The following step has been left till now as the tess fails are used to
651  determine if the word is deletable.
652  */
654  word->merge_tess_fails();
655  page_res_it.forward ();
656  }
657 }
658 
659 
661  int i;
662  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
663  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
664  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
665  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
666  for (i = 0; i < word_res->reject_map.length(); ++i) {
667  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
668  word_res->best_choice->set_unichar_id(unichar_dash, i);
669  if (word_res->reject_map[i].accepted ())
670  word_res->reject_map[i].setrej_unlv_rej ();
671  }
672  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
673  word_res->best_choice->set_unichar_id(unichar_space, i);
674  if (word_res->reject_map[i].accepted ())
675  word_res->reject_map[i].setrej_unlv_rej ();
676  }
677  }
678 }
679 
681  enum STATES
682  {
683  JUNK,
684  FIRST_UPPER,
685  FIRST_LOWER,
686  FIRST_NUM,
687  SUBSEQUENT_UPPER,
688  SUBSEQUENT_LOWER,
689  SUBSEQUENT_NUM
690  };
691  const char *str = word->best_choice->unichar_string().string();
692  const char *lengths = word->best_choice->unichar_lengths().string();
693  STATES state = JUNK;
694  int len = 0;
695  int isolated_digits = 0;
696  int isolated_alphas = 0;
697  int bad_char_count = 0;
698  int tess_rejs = 0;
699  int dodgy_chars = 0;
700  int ok_chars;
701  UNICHAR_ID last_char = -1;
702  int alpha_repetition_count = 0;
703  int longest_alpha_repetition_count = 0;
704  int longest_lower_run_len = 0;
705  int lower_string_count = 0;
706  int longest_upper_run_len = 0;
707  int upper_string_count = 0;
708  int total_alpha_count = 0;
709  int total_digit_count = 0;
710 
711  for (; *str != '\0'; str += *(lengths++)) {
712  len++;
713  if (word->uch_set->get_isupper (str, *lengths)) {
714  total_alpha_count++;
715  switch (state) {
716  case SUBSEQUENT_UPPER:
717  case FIRST_UPPER:
718  state = SUBSEQUENT_UPPER;
719  upper_string_count++;
720  if (longest_upper_run_len < upper_string_count)
721  longest_upper_run_len = upper_string_count;
722  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
723  alpha_repetition_count++;
724  if (longest_alpha_repetition_count < alpha_repetition_count) {
725  longest_alpha_repetition_count = alpha_repetition_count;
726  }
727  }
728  else {
729  last_char = word->uch_set->unichar_to_id(str, *lengths);
730  alpha_repetition_count = 1;
731  }
732  break;
733  case FIRST_NUM:
734  isolated_digits++;
735  default:
736  state = FIRST_UPPER;
737  last_char = word->uch_set->unichar_to_id(str, *lengths);
738  alpha_repetition_count = 1;
739  upper_string_count = 1;
740  break;
741  }
742  }
743  else if (word->uch_set->get_islower (str, *lengths)) {
744  total_alpha_count++;
745  switch (state) {
746  case SUBSEQUENT_LOWER:
747  case FIRST_LOWER:
748  state = SUBSEQUENT_LOWER;
749  lower_string_count++;
750  if (longest_lower_run_len < lower_string_count)
751  longest_lower_run_len = lower_string_count;
752  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753  alpha_repetition_count++;
754  if (longest_alpha_repetition_count < alpha_repetition_count) {
755  longest_alpha_repetition_count = alpha_repetition_count;
756  }
757  }
758  else {
759  last_char = word->uch_set->unichar_to_id(str, *lengths);
760  alpha_repetition_count = 1;
761  }
762  break;
763  case FIRST_NUM:
764  isolated_digits++;
765  default:
766  state = FIRST_LOWER;
767  last_char = word->uch_set->unichar_to_id(str, *lengths);
768  alpha_repetition_count = 1;
769  lower_string_count = 1;
770  break;
771  }
772  }
773  else if (word->uch_set->get_isdigit (str, *lengths)) {
774  total_digit_count++;
775  switch (state) {
776  case FIRST_NUM:
777  state = SUBSEQUENT_NUM;
778  case SUBSEQUENT_NUM:
779  break;
780  case FIRST_UPPER:
781  case FIRST_LOWER:
782  isolated_alphas++;
783  default:
784  state = FIRST_NUM;
785  break;
786  }
787  }
788  else {
789  if (*lengths == 1 && *str == ' ')
790  tess_rejs++;
791  else
792  bad_char_count++;
793  switch (state) {
794  case FIRST_NUM:
795  isolated_digits++;
796  break;
797  case FIRST_UPPER:
798  case FIRST_LOWER:
799  isolated_alphas++;
800  default:
801  break;
802  }
803  state = JUNK;
804  }
805  }
806 
807  switch (state) {
808  case FIRST_NUM:
809  isolated_digits++;
810  break;
811  case FIRST_UPPER:
812  case FIRST_LOWER:
813  isolated_alphas++;
814  default:
815  break;
816  }
817 
819  total_alpha_count += total_digit_count - isolated_digits;
820  }
821 
822  if (crunch_leave_ok_strings && len >= 4 &&
823  2 * (total_alpha_count - isolated_alphas) > len &&
824  longest_alpha_repetition_count < crunch_long_repetitions) {
825  if ((crunch_accept_ok &&
826  acceptable_word_string(*word->uch_set, str, lengths) !=
827  AC_UNACCEPTABLE) ||
828  longest_lower_run_len > crunch_leave_lc_strings ||
829  longest_upper_run_len > crunch_leave_uc_strings)
830  return G_NEVER_CRUNCH;
831  }
832  if (word->reject_map.length() > 1 &&
833  strpbrk(str, " ") == nullptr &&
834  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
835  word->best_choice->permuter() == FREQ_DAWG_PERM ||
836  word->best_choice->permuter() == USER_DAWG_PERM ||
837  word->best_choice->permuter() == NUMBER_PERM ||
838  acceptable_word_string(*word->uch_set, str, lengths) !=
839  AC_UNACCEPTABLE || ok_dict_word))
840  return G_OK;
841 
842  ok_chars = len - bad_char_count - isolated_digits -
843  isolated_alphas - tess_rejs;
844 
845  if (crunch_debug > 3) {
846  tprintf("garbage_word: \"%s\"\n",
847  word->best_choice->unichar_string().string());
848  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
849  len,
850  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
851  }
852  if (bad_char_count == 0 &&
853  tess_rejs == 0 &&
854  (len > isolated_digits + isolated_alphas || len <= 2))
855  return G_OK;
856 
857  if (tess_rejs > ok_chars ||
858  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
859  return G_TERRIBLE;
860 
861  if (len > 4) {
862  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
863  isolated_alphas;
864  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
865  return G_DODGY;
866  else
867  return G_OK;
868  } else {
869  dodgy_chars = 2 * tess_rejs + bad_char_count;
870  if ((len == 4 && dodgy_chars > 2) ||
871  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
872  return G_DODGY;
873  else
874  return G_OK;
875  }
876 }
877 
878 
879 /*************************************************************************
880  * word_deletable()
881  * DELETE WERDS AT ENDS OF ROWS IF
882  * Word is crunched &&
883  * ( string length = 0 OR
884  * > 50% of chars are "|" (before merging) OR
885  * certainty < -10 OR
886  * rating /char > 60 OR
887  * TOP of word is more than 0.5 xht BELOW baseline OR
888  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
889  * length of word < 3xht OR
890  * height of word < 0.7 xht OR
891  * height of word > 3.0 xht OR
892  * >75% of the outline BBs have longest dimension < 0.5xht
893  *************************************************************************/
894 
895 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
896  int word_len = word->reject_map.length ();
897  float rating_per_ch;
898  TBOX box; //BB of word
899 
900  if (word->unlv_crunch_mode == CR_NONE) {
901  delete_mode = 0;
902  return CR_NONE;
903  }
904 
905  if (word_len == 0) {
906  delete_mode = 1;
907  return CR_DELETE;
908  }
909 
910  if (word->rebuild_word != nullptr) {
911  // Cube leaves rebuild_word nullptr.
912  box = word->rebuild_word->bounding_box();
913  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
914  delete_mode = 4;
915  return CR_DELETE;
916  }
917 
918  if (noise_outlines(word->rebuild_word)) {
919  delete_mode = 5;
920  return CR_DELETE;
921  }
922  }
923 
924  if ((failure_count (word) * 1.5) > word_len) {
925  delete_mode = 2;
926  return CR_LOOSE_SPACE;
927  }
928 
929  if (word->best_choice->certainty () < crunch_del_cert) {
930  delete_mode = 7;
931  return CR_LOOSE_SPACE;
932  }
933 
934  rating_per_ch = word->best_choice->rating () / word_len;
935 
936  if (rating_per_ch > crunch_del_rating) {
937  delete_mode = 8;
938  return CR_LOOSE_SPACE;
939  }
940 
942  delete_mode = 9;
943  return CR_LOOSE_SPACE;
944  }
945 
946  if (box.bottom () >
948  delete_mode = 10;
949  return CR_LOOSE_SPACE;
950  }
951 
952  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
953  delete_mode = 11;
954  return CR_LOOSE_SPACE;
955  }
956 
957  if (box.width () < crunch_del_min_width * kBlnXHeight) {
958  delete_mode = 3;
959  return CR_LOOSE_SPACE;
960  }
961 
962  delete_mode = 0;
963  return CR_NONE;
964 }
965 
967  const char *str = word->best_choice->unichar_string().string();
968  int tess_rejs = 0;
969 
970  for (; *str != '\0'; str++) {
971  if (*str == ' ')
972  tess_rejs++;
973  }
974  return tess_rejs;
975 }
976 
977 
979  TBOX box; // BB of outline
980  int16_t outline_count = 0;
981  int16_t small_outline_count = 0;
982  int16_t max_dimension;
983  float small_limit = kBlnXHeight * crunch_small_outlines_size;
984 
985  for (int b = 0; b < word->NumBlobs(); ++b) {
986  TBLOB* blob = word->blobs[b];
987  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
988  outline_count++;
989  box = ol->bounding_box();
990  if (box.height() > box.width())
991  max_dimension = box.height();
992  else
993  max_dimension = box.width();
994  if (max_dimension < small_limit)
995  small_outline_count++;
996  }
997  }
998  return small_outline_count >= outline_count;
999 }
1000 
1001 } // namespace tesseract
BLOCK_RES * block() const
Definition: pageres.h:757
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:966
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:45
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
void rej_word_block_rej()
Definition: rejctmap.cpp:435
TWERD * rebuild_word
Definition: pageres.h:260
TESSLINE * next
Definition: blobs.h:265
int UNICHAR_ID
Definition: unichar.h:35
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
Definition: blobs.h:402
int32_t rej_count
Definition: pageres.h:80
ROW_RES * row() const
Definition: pageres.h:754
tesseract::BoxWord * bln_boxes
Definition: pageres.h:198
int32_t whole_word_rej_count
Definition: pageres.h:146
uint32_t unsigned_size() const
Definition: strngs.h:71
REJMAP reject_map
Definition: pageres.h:287
void rej_word_row_rej()
Definition: rejctmap.cpp:444
const char * string() const
Definition: strngs.cpp:196
uint8_t permuter() const
Definition: ratngs.h:346
int32_t char_count
Definition: pageres.h:118
TBOX bounding_box() const
Definition: blobs.cpp:871
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:504
Definition: rect.h:34
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:302
int NumBlobs() const
Definition: blobs.h:432
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
const int kBlnXHeight
Definition: normalis.h:24
void merge_tess_fails()
Definition: pageres.cpp:1073
double tessedit_reject_doc_percent
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float rating() const
Definition: ratngs.h:327
double tessedit_reject_block_percent
void rej_word_doc_rej()
Definition: rejctmap.cpp:426
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
float certainty() const
Definition: ratngs.h:330
const int kBlnBaselineOffset
Definition: normalis.h:25
bool crunch_early_convert_bad_unlv_chs
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:978
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:418
uint8_t space()
Definition: werd.h:102
double tessedit_reject_row_percent
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:660
bool tessedit_preserve_blk_rej_perfect_wds
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:32
double tessedit_good_doc_still_rowrej_wd
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:39
int16_t width() const
Definition: rect.h:115
void CountMatchingBlobs(int index)
Definition: docqual.cpp:35
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:161
int index() const
Definition: pdblock.h:68
void rej_stat_word()
Definition: pageres.cpp:1674
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int16_t top() const
Definition: rect.h:58
ROW_RES * prev_row() const
Definition: pageres.h:745
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
CRUNCH_MODE
Definition: pageres.h:159
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
double tessedit_whole_wd_rej_row_percent
int16_t reject_count()
Definition: rejctmap.h:229
int16_t accept_count()
Definition: rejctmap.cpp:281
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:542
bool tessedit_preserve_row_rej_perfect_wds
WERD_RES * word() const
Definition: pageres.h:751
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:127
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
Definition: docqual.h:32
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
unsigned char BOOL8
Definition: host.h:34
Definition: ocrrow.h:36
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73
int32_t char_count
Definition: pageres.h:79
bool IsText() const
Definition: polyblk.h:49
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
int NumOutlines() const
Definition: blobs.cpp:464
PAGE_RES * page_res
Definition: pageres.h:677
GARBAGE_LEVEL
Definition: docqual.h:29
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
bool reject_spaces
Definition: pageres.h:336
bool rejected
Definition: pageres.h:82
Definition: strngs.h:45
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:233
int32_t rej_count
Definition: pageres.h:119
const UNICHARSET * uch_set
Definition: pageres.h:206
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:590
const STRING & unichar_string() const
Definition: ratngs.h:541
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:139
WERD_RES * forward()
Definition: pageres.h:731
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:680
int32_t char_count
Definition: pageres.h:144
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:407
Definition: blobs.h:268
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:895
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
int32_t rej_count
Definition: pageres.h:145
int16_t bottom() const
Definition: rect.h:65
TESSLINE * outlines
Definition: blobs.h:384
Unacceptable word.
Definition: control.h:30
PDBLK pdblk
Definition: ocrblock.h:192
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
int16_t height() const
Definition: rect.h:108
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:116
ROW * row
Definition: pageres.h:143
WERD * word
Definition: pageres.h:189