tesseract  5.0.0-alpha-619-ge9db
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1994, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <cctype>
20 #include "docqual.h"
21 #include "reject.h"
22 #include "tessvars.h"
23 #include "tesseractclass.h"
24 
25 namespace tesseract{
26 
// Per-blob callback: adds one to the caller's running tally.
// match_count is the counter to bump; the blob index passed as the second
// argument is accepted only to fit the callback shape and is ignored.
static void countMatchingBlobs(int16_t& match_count, int /*index*/) {
  ++match_count;
}
30 
31 static void countAcceptedBlobs(WERD_RES* word, int16_t& match_count,
32  int16_t& accepted_match_count, int index) {
33  if (word->reject_map[index].accepted()) {
34  ++accepted_match_count;
35  }
36  ++match_count;
37 }
38 
39 static void acceptIfGoodQuality(WERD_RES* word, int index) {
40  if (word->reject_map[index].accept_if_good_quality()) {
41  word->reject_map[index].setrej_quality_accept();
42  }
43 }
44 
45 /*************************************************************************
46  * word_blob_quality()
47  * How many blobs in the box_word are identical to those of the inword?
48  * ASSUME blobs in both initial word and box_word are in ascending order of
49  * left hand blob edge.
50  *************************************************************************/
// NOTE(review): listing lines 51 and 56 are absent from this extract - the
// function signature and the start of the call that receives the two
// arguments on lines 57-58. Restore them from the real source before building.
52  int16_t match_count = 0;
// Only attempt the comparison when both the box word and a non-empty rebuilt
// word are available; otherwise the count stays 0.
53  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54  !word->rebuild_word->blobs.empty()) {
55  using namespace std::placeholders; // for _1
57  *word->rebuild_word,
// NOTE(review): std::bind stores its own copy of match_count, so the
// int16_t& parameter of countMatchingBlobs binds to that copy rather than to
// the local returned below. As written this looks like it always returns 0;
// std::ref(match_count) would share the counter - confirm.
58  std::bind(countMatchingBlobs, match_count, _1));
59  }
60  return match_count;
61 }
62 
// Body of a function that totals count_outline_errs() over every blob of
// word->rebuild_word and returns the sum (0 when rebuild_word is null).
// NOTE(review): the signature (listing line 63) is absent from this extract.
64  int16_t i = 0;
65  int16_t err_count = 0;
66 
67  if (word->rebuild_word != nullptr) {
68  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
69  TBLOB* blob = word->rebuild_word->blobs[b];
// NOTE(review): i advances by one per blob, so unichar_string()[i] is the
// i-th byte of the string. That only lines up with the i-th blob's character
// if every unichar is one byte long, and nothing here checks that the string
// has at least NumBlobs() bytes - confirm.
70  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
71  blob->NumOutlines());
72  i++;
73  }
74  }
75  return err_count;
76 }
77 
78 /*************************************************************************
79  * word_char_quality()
80  * Combination of blob quality and outline quality - how many good chars are
81  * there? - I.e chars which pass the blob AND outline tests.
82  *************************************************************************/
// Outputs: *match_count and *accepted_match_count are both reset to 0 first.
// NOTE(review): listing line 90 (the start of the call that takes the
// arguments on lines 91-93) is absent from this extract.
83 void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
84  int16_t* accepted_match_count) {
85  *match_count = 0;
86  *accepted_match_count = 0;
87  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
88  !word->rebuild_word->blobs.empty()) {
89  using namespace std::placeholders; // for _1
91  *word->rebuild_word,
// NOTE(review): *match_count and *accepted_match_count are handed to
// std::bind as values, so std::bind keeps private copies and the int16_t&
// parameters of countAcceptedBlobs bind to those copies. The caller's
// outputs therefore look like they remain 0; wrapping both in std::ref()
// would let the callback update them - confirm.
92  std::bind(countAcceptedBlobs,
93  word, *match_count, *accepted_match_count, _1));
94  }
95 }
96 
97 /*************************************************************************
98  * unrej_good_chs()
99  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
100  *************************************************************************/
// NOTE(review): listing lines 101 (the signature) and 105 (the start of the
// call that takes the arguments on line 106) are absent from this extract.
// NOTE(review): the last clause below is blobs.empty() with no negation,
// whereas the two otherwise identical guards earlier in this file test
// !blobs.empty(). As written the body only runs for a word with no blobs,
// which looks like a dropped '!' - confirm.
102  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
103  word->rebuild_word->blobs.empty()) {
104  using namespace std::placeholders; // for _1
106  *word->rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
107  }
108 }
109 
110 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
111  int expected_outline_count;
112 
113  if (STRING (outlines_odd).contains (c))
114  return 0; // Don't use this char
115  else if (STRING (outlines_2).contains (c))
116  expected_outline_count = 2;
117  else
118  expected_outline_count = 1;
119  return abs (outline_count - expected_outline_count);
120 }
121 
// Tail of a function taking (page_res_it, good_quality_doc) that runs the
// rejection passes in order.
// NOTE(review): the first signature line (listing line 122) is absent from
// this extract; presumably this is Tesseract::quality_based_rejection, which
// the cross-reference list at the end of this listing declares with these
// parameters - confirm.
123  bool good_quality_doc) {
// Unrejection is only attempted on documents flagged as good quality, and
// only when tessedit_good_quality_unrej is set.
124  if ((tessedit_good_quality_unrej && good_quality_doc))
125  unrej_good_quality_words(page_res_it);
126  doc_and_block_rejection(page_res_it, good_quality_doc);
// Tilde crunching and deletion run as a pair, gated by unlv_tilde_crunching.
127  if (unlv_tilde_crunching) {
128  tilde_crunch(page_res_it);
129  tilde_delete(page_res_it);
130  }
131 }
132 
133 /*************************************************************************
134  * unrej_good_quality_words()
135  * Accept potential rejects in words which pass the following checks:
136  * - Contains a potential reject
137  * - Word looks like a sensible alpha word.
138  * - Word segmentation is the same as the original image
139  * - All characters have the expected number of outlines
140  * NOTE - the rejection counts are recalculated after unrejection
141  * - CAN'T do it in a single pass without a bit of fiddling
142  * - keep it simple but inefficient
143  *************************************************************************/
// NOTE(review): listing lines 165 and 167-169 are absent from this extract:
// the row reject-ratio threshold that closes the "else if" condition, and the
// start of the per-word test whose tail is on lines 170-172.
144 void Tesseract::unrej_good_quality_words( //unreject potential
145  PAGE_RES_IT &page_res_it) {
146  WERD_RES *word;
147  ROW_RES *current_row;
148  BLOCK_RES *current_block;
149  int i;
150 
// Pass 1: walk every word on the page and unreject where allowed.
151  page_res_it.restart_page ();
152  while (page_res_it.word () != nullptr) {
153  check_debug_pt (page_res_it.word (), 100);
// bland_unrej: accept every reject-map entry that reports
// accept_if_good_quality(), with no row-level or word-level checks.
154  if (bland_unrej) {
155  word = page_res_it.word ();
156  for (i = 0; i < word->reject_map.length (); i++) {
157  if (word->reject_map[i].accept_if_good_quality ())
158  word->reject_map[i].setrej_quality_accept ();
159  }
160  page_res_it.forward ();
161  }
// Otherwise the row must contain characters and its rejected fraction
// (rej_count / char_count) must not exceed a limit given on the missing
// listing line 165.
162  else if ((page_res_it.row ()->char_count > 0) &&
163  ((page_res_it.row ()->rej_count /
164  static_cast<float>(page_res_it.row ()->char_count)) <=
166  word = page_res_it.word ();
170  word->best_choice->unichar_string().c_str(),
171  word->best_choice->unichar_lengths().c_str())
172  != AC_UNACCEPTABLE)) {
173  unrej_good_chs(word);
174  }
175  page_res_it.forward ();
176  }
177  else {
178  // Skip to end of dodgy row.
179  current_row = page_res_it.row ();
180  while ((page_res_it.word () != nullptr) &&
181  (page_res_it.row () == current_row))
182  page_res_it.forward ();
183  }
184  check_debug_pt (page_res_it.word (), 110);
185  }
// Pass 2: zero the page, block and row counters, then rebuild them one word
// at a time through rej_stat_word(). Block and row counters are reset the
// first time the iterator reports a new block or row.
186  page_res_it.restart_page ();
187  page_res_it.page_res->char_count = 0;
188  page_res_it.page_res->rej_count = 0;
189  current_block = nullptr;
190  current_row = nullptr;
191  while (page_res_it.word () != nullptr) {
192  if (current_block != page_res_it.block ()) {
193  current_block = page_res_it.block ();
194  current_block->char_count = 0;
195  current_block->rej_count = 0;
196  }
197  if (current_row != page_res_it.row ()) {
198  current_row = page_res_it.row ();
199  current_row->char_count = 0;
200  current_row->rej_count = 0;
201  current_row->whole_word_rej_count = 0;
202  }
203  page_res_it.rej_stat_word ();
204  page_res_it.forward ();
205  }
206 }
207 
208 
209 /*************************************************************************
210  * doc_and_block_rejection()
211  *
212  * If the page has too many rejects - reject all of it.
213  * If any block has too many rejects - reject all words in the block
214  *************************************************************************/
215 
// NOTE(review): this extract has lost many listing lines of this function
// (230, 232, 238, 253-254, 262, 264, 266-267, 284, 289, 295, 314, 317-318,
// 330-331, 334, 336-337, 354, 359, 365): thresholds, debug guards, the calls
// that actually apply the block/row rejection, and parts of several
// conditions. The brace structure below therefore cannot be matched up from
// this text alone; restore the lines from the real source before editing.
216 void Tesseract::doc_and_block_rejection( //reject big chunks
217  PAGE_RES_IT &page_res_it,
218  bool good_quality_doc) {
219  int16_t block_no = 0;
220  int16_t row_no = 0;
221  BLOCK_RES *current_block;
222  ROW_RES *current_row;
223 
224  bool rej_word;
225  bool prev_word_rejected;
226  int16_t char_quality = 0;
227  int16_t accepted_char_quality;
228 
// Page-level test on the page reject count expressed as a percentage; the
// divisor and the threshold are on the missing listing line 230. When the
// test passes the whole page is rejected.
229  if (page_res_it.page_res->rej_count * 100.0 /
231  reject_whole_page(page_res_it);
233  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
234  page_res_it.page_res->char_count,
235  page_res_it.page_res->rej_count);
236  }
237  } else {
239  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
240  page_res_it.page_res->char_count,
241  page_res_it.page_res->rej_count);
242  }
243 
244  /* Walk blocks testing for block rejection */
245 
246  page_res_it.restart_page();
247  WERD_RES* word;
248  while ((word = page_res_it.word()) != nullptr) {
249  current_block = page_res_it.block();
250  block_no = current_block->block->pdblk.index();
// Block-level test: the block must contain characters and its reject
// percentage must exceed a threshold (on missing listing lines 253-254).
251  if (current_block->char_count > 0 &&
252  (current_block->rej_count * 100.0 / current_block->char_count) >
255  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
256  block_no, current_block->char_count,
257  current_block->rej_count);
258  }
259  prev_word_rejected = false;
// Visit every word that belongs to the current block.
260  while ((word = page_res_it.word()) != nullptr &&
261  (page_res_it.block() == current_block)) {
263  rej_word = word->reject_map.reject_count() > 0 ||
265  if (rej_word && tessedit_dont_blkrej_good_wds &&
268  *word->uch_set,
269  word->best_choice->unichar_string().c_str(),
270  word->best_choice->unichar_lengths().c_str()) !=
271  AC_UNACCEPTABLE) {
// NOTE(review): word_char_quality() (defined earlier in this file) passes
// *match_count to std::bind by value, so char_quality may always come back
// as 0 here, which would make the comparison below true for every non-empty
// word - confirm.
272  word_char_quality(word, &char_quality, &accepted_char_quality);
273  rej_word = char_quality != word->reject_map.length();
274  }
275  } else {
276  rej_word = true;
277  }
278  if (rej_word) {
279  /*
280  Reject spacing if both current and prev words are rejected.
281  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
282  generated more space errors.
283  */
285  prev_word_rejected &&
286  page_res_it.prev_row() == page_res_it.row() &&
287  word->word->space() == 1)
288  word->reject_spaces = true;
290  }
291  prev_word_rejected = rej_word;
292  page_res_it.forward();
293  }
294  } else {
296  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
297  block_no, page_res_it.block()->char_count,
298  page_res_it.block()->rej_count);
299  }
300 
301  /* Walk rows in block testing for row rejection */
302  row_no = 0;
303  while (page_res_it.word() != nullptr &&
304  page_res_it.block() == current_block) {
305  current_row = page_res_it.row();
306  row_no++;
307  /* Reject whole row if:
308  fraction of chars on row which are rejected exceed a limit AND
309  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
310  limit
311  */
// NOTE(review): the second ratio divides by current_row->rej_count; the
// visible guard only checks char_count > 0. It is reached only when the
// first comparison (whose threshold is on missing line 314) holds - confirm
// that this implies rej_count > 0.
312  if (current_row->char_count > 0 &&
313  (current_row->rej_count * 100.0 / current_row->char_count) >
315  (current_row->whole_word_rej_count * 100.0 /
316  current_row->rej_count) <
319  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
320  row_no, current_row->char_count,
321  current_row->rej_count);
322  }
323  prev_word_rejected = false;
// Visit every word that belongs to the current row.
324  while ((word = page_res_it.word()) != nullptr &&
325  page_res_it.row () == current_row) {
326  /* Preserve words on good docs unless they are mostly rejected*/
327  if (!tessedit_row_rej_good_docs && good_quality_doc) {
// NOTE(review): divides by reject_map.length() as a float; a zero-length
// reject map would give a non-finite ratio - confirm that cannot occur here.
328  rej_word = word->reject_map.reject_count() /
329  static_cast<float>(word->reject_map.length()) >
332  /* Preserve perfect words anyway */
333  rej_word = word->reject_map.reject_count() > 0 ||
335  if (rej_word && tessedit_dont_rowrej_good_wds &&
338  word->best_choice->unichar_string().c_str(),
339  word->best_choice->unichar_lengths().c_str()) !=
340  AC_UNACCEPTABLE) {
341  word_char_quality(word, &char_quality,
342  &accepted_char_quality);
343  rej_word = char_quality != word->reject_map.length();
344  }
345  } else {
346  rej_word = true;
347  }
348  if (rej_word) {
349  /*
350  Reject spacing if both current and prev words are rejected.
351  NOTE - this is NOT restricted to FUZZY spaces. - When tried
352  this generated more space errors.
353  */
355  prev_word_rejected &&
356  page_res_it.prev_row() == page_res_it.row() &&
357  word->word->space () == 1)
358  word->reject_spaces = true;
360  }
361  prev_word_rejected = rej_word;
362  page_res_it.forward();
363  }
364  } else {
366  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
367  row_no, current_row->char_count, current_row->rej_count);
368  }
// Skip whatever remains of the current row before testing the next one.
369  while (page_res_it.word() != nullptr &&
370  page_res_it.row() == current_row)
371  page_res_it.forward();
372  }
373  }
374  }
375  }
376  }
377 }
378 
379 } // namespace tesseract
380 
381 /*************************************************************************
382  * reject_whole_page()
383  * Don't believe any of it - set the reject map to 00..00 in all words
384  *
385  *************************************************************************/
386 
387 void reject_whole_page(PAGE_RES_IT &page_res_it) {
388  page_res_it.restart_page ();
389  while (page_res_it.word () != nullptr) {
390  page_res_it.word ()->reject_map.rej_word_doc_rej ();
391  page_res_it.forward ();
392  }
393  //whole page is rejected
394  page_res_it.page_res->rejected = true;
395 }
396 
397 namespace tesseract {
// Walks every word in text blocks of the page and decides which ones to
// "crunch", using terrible_word_crunch() and potential_word_crunch().
// NOTE(review): listing lines 415, 418, 436 and 458 are absent from this
// extract. Lines 415 and 418 presumably guard the two calls that follow
// them; 436 and 458 presumably set the crunch mode of the current word in the
// "terrible" and "P2" branches - confirm against the real source.
398 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
399  WERD_RES *word;
400  GARBAGE_LEVEL garbage_level;
// copy_it remembers the first word of a run of "potential" crunches so the
// run can be revisited if a terrible word turns up later.
401  PAGE_RES_IT copy_it;
402  bool prev_potential_marked = false;
403  bool found_terrible_word = false;
404  bool ok_dict_word;
405 
406  page_res_it.restart_page();
407  while (page_res_it.word() != nullptr) {
// Words in blocks whose poly block exists and is not text are skipped.
408  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
409  if (pb != nullptr && !pb->IsText()) {
410  page_res_it.forward();
411  continue;
412  }
413  word = page_res_it.word();
414 
416  convert_bad_unlv_chs(word);
417 
419  word->merge_tess_fails();
420 
// A word with at least one accepted character is never crunched and also
// ends any pending run of terrible/potential words.
421  if (word->reject_map.accept_count () != 0) {
422  found_terrible_word = false;
423  //Forget earlier potential crunches
424  prev_potential_marked = false;
425  }
426  else {
427  ok_dict_word = safe_dict_word(word);
428  garbage_level = garbage_word(word, ok_dict_word);
429 
430  if ((garbage_level != G_NEVER_CRUNCH) &&
431  (terrible_word_crunch (word, garbage_level))) {
432  if (crunch_debug > 0) {
433  tprintf ("T CRUNCHING: \"%s\"\n",
434  word->best_choice->unichar_string().c_str());
435  }
// A terrible word also crunches the run of potential words remembered in
// copy_it, from the marked word up to (not including) this one.
437  if (prev_potential_marked) {
438  while (copy_it.word () != word) {
439  if (crunch_debug > 0) {
440  tprintf ("P1 CRUNCHING: \"%s\"\n",
441  copy_it.word()->best_choice->unichar_string().c_str());
442  }
443  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
444  copy_it.forward ();
445  }
446  prev_potential_marked = false;
447  }
448  found_terrible_word = true;
449  }
450  else if ((garbage_level != G_NEVER_CRUNCH) &&
451  (potential_word_crunch (word,
452  garbage_level, ok_dict_word))) {
// A potential word directly following a terrible one is handled at once
// (the action itself is on missing line 458); otherwise the start of the
// run is remembered.
453  if (found_terrible_word) {
454  if (crunch_debug > 0) {
455  tprintf ("P2 CRUNCHING: \"%s\"\n",
456  word->best_choice->unichar_string().c_str());
457  }
459  }
460  else if (!prev_potential_marked) {
461  copy_it = page_res_it;
462  prev_potential_marked = true;
463  if (crunch_debug > 1) {
464  tprintf ("P3 CRUNCHING: \"%s\"\n",
465  word->best_choice->unichar_string().c_str());
466  }
467  }
468  }
469  else {
470  found_terrible_word = false;
471  //Forget earlier potential crunches
472  prev_potential_marked = false;
473  if (crunch_debug > 2) {
474  tprintf ("NO CRUNCH: \"%s\"\n",
475  word->best_choice->unichar_string().c_str());
476  }
477  }
478  }
479  page_res_it.forward ();
480  }
481 }
482 
483 
// Tail of the function that tilde_crunch() calls as
// terrible_word_crunch(word, garbage_level). Returns true when any of the
// numbered crunch_mode tests below fires.
// NOTE(review): listing lines 484 (first signature line) and 492 (the end of
// the first condition) are absent from this extract.
485  GARBAGE_LEVEL garbage_level) {
486  float rating_per_ch;
487  int adjusted_len;
488  int crunch_mode = 0;
489 
// Mode 1: empty best-choice string, or a strspn() test on leading spaces
// whose right-hand side is on the missing line 492.
490  if ((word->best_choice->unichar_string().length() == 0) ||
491  (strspn(word->best_choice->unichar_string().c_str(), " ") ==
493  crunch_mode = 1;
494  else {
// The rating is averaged over the reject-map length, capped at
// crunch_rating_max.
// NOTE(review): nothing visible here rules out reject_map.length() == 0,
// which would make the float division below non-finite - confirm.
495  adjusted_len = word->reject_map.length ();
496  if (adjusted_len > crunch_rating_max)
497  adjusted_len = crunch_rating_max;
498  rating_per_ch = word->best_choice->rating () / adjusted_len;
499 
// Modes 2-5 are tried in order; the first that matches wins.
500  if (rating_per_ch > crunch_terrible_rating)
501  crunch_mode = 2;
502  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
503  crunch_mode = 3;
504  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
505  (garbage_level != G_OK))
506  crunch_mode = 4;
507  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
508  (garbage_level != G_OK))
509  crunch_mode = 5;
510  }
511  if (crunch_mode > 0) {
512  if (crunch_debug > 2) {
513  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
514  crunch_mode, word->best_choice->unichar_string().c_str());
515  }
516  return true;
517  }
518  else
519  return false;
520 }
521 
// Tail of the function that tilde_crunch() calls as
// potential_word_crunch(word, garbage_level, ok_dict_word). Counts up to
// three "poor" indicators (rating, certainty, garbage level) and returns true
// when the count reaches crunch_pot_indicators.
// NOTE(review): listing lines 522 (first signature line), 534 (start of the
// call whose arguments are on line 535) and 552 (the second half of the
// certainty test) are absent from this extract.
523  GARBAGE_LEVEL garbage_level,
524  bool ok_dict_word) {
525  float rating_per_ch;
526  int adjusted_len;
527  const char *str = word->best_choice->unichar_string().c_str();
528  const char *lengths = word->best_choice->unichar_lengths().c_str();
529  bool word_crunchable;
530  int poor_indicator_count = 0;
531 
// word_crunchable is only consulted by the certainty indicator further down.
532  word_crunchable = !crunch_leave_accept_strings ||
533  word->reject_map.length() < 3 ||
535  str, lengths) == AC_UNACCEPTABLE &&
536  !ok_dict_word);
537 
// NOTE(review): the length cap here is the literal 10, while the otherwise
// parallel computation in terrible_word_crunch() caps at crunch_rating_max -
// confirm the difference is intended. As there, a zero reject_map length
// would make the division non-finite.
538  adjusted_len = word->reject_map.length();
539  if (adjusted_len > 10)
540  adjusted_len = 10;
541  rating_per_ch = word->best_choice->rating() / adjusted_len;
542 
// Indicator 1: rating per character above crunch_pot_poor_rate.
543  if (rating_per_ch > crunch_pot_poor_rate) {
544  if (crunch_debug > 2) {
545  tprintf("Potential poor rating on \"%s\"\n",
546  word->best_choice->unichar_string().c_str());
547  }
548  poor_indicator_count++;
549  }
550 
// Indicator 2: a certainty test (on missing line 552), applied only to
// crunchable words.
551  if (word_crunchable &&
553  if (crunch_debug > 2) {
554  tprintf("Potential poor cert on \"%s\"\n",
555  word->best_choice->unichar_string().c_str());
556  }
557  poor_indicator_count++;
558  }
559 
// Indicator 3: any garbage level other than G_OK.
560  if (garbage_level != G_OK) {
561  if (crunch_debug > 2) {
562  tprintf("Potential garbage on \"%s\"\n",
563  word->best_choice->unichar_string().c_str());
564  }
565  poor_indicator_count++;
566  }
567  return poor_indicator_count >= crunch_pot_indicators;
568 }
569 
// Walks every word on the page and, for words that word_deletable() reports
// as deletable, stores the returned mode in unlv_crunch_mode - but only for
// runs that touch the beginning (W_BOL) or end (W_EOL) of a line.
// NOTE(review): listing line 633 is absent from this extract; it presumably
// guards the merge_tess_fails() call on line 634 - confirm.
570 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
571  WERD_RES *word;
// copy_it remembers the first word of a deletable run that started
// mid-line, so the run can be applied if it reaches the end of the line.
572  PAGE_RES_IT copy_it;
573  bool deleting_from_bol = false;
574  bool marked_delete_point = false;
575  int16_t debug_delete_mode;
576  CRUNCH_MODE delete_mode;
577  int16_t x_debug_delete_mode;
578  CRUNCH_MODE x_delete_mode;
579 
580  page_res_it.restart_page();
581  while (page_res_it.word() != nullptr) {
582  word = page_res_it.word();
583 
584  delete_mode = word_deletable (word, debug_delete_mode);
585  if (delete_mode != CR_NONE) {
// Deletable run anchored at the beginning of a line: apply immediately and
// keep going while the following words are deletable too.
586  if (word->word->flag (W_BOL) || deleting_from_bol) {
587  if (crunch_debug > 0) {
588  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
589  debug_delete_mode,
590  word->best_choice->unichar_string().c_str());
591  }
592  word->unlv_crunch_mode = delete_mode;
593  deleting_from_bol = true;
594  } else if (word->word->flag(W_EOL)) {
// Deletable run reaching the end of a line: first apply the mode to the
// words remembered since copy_it, then to this word.
595  if (marked_delete_point) {
596  while (copy_it.word() != word) {
597  x_delete_mode = word_deletable (copy_it.word (),
598  x_debug_delete_mode);
599  if (crunch_debug > 0) {
600  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
601  x_debug_delete_mode,
602  copy_it.word()->best_choice->unichar_string().c_str());
603  }
604  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
605  copy_it.forward ();
606  }
607  }
608  if (crunch_debug > 0) {
609  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
610  debug_delete_mode,
611  word->best_choice->unichar_string().c_str());
612  }
613  word->unlv_crunch_mode = delete_mode;
614  deleting_from_bol = false;
615  marked_delete_point = false;
616  }
617  else {
// Deletable word in the middle of a line: only remember where the run began.
618  if (!marked_delete_point) {
619  copy_it = page_res_it;
620  marked_delete_point = true;
621  }
622  }
623  }
624  else {
625  deleting_from_bol = false;
626  //Forget earlier potential crunches
627  marked_delete_point = false;
628  }
629  /*
630  The following step has been left till now as the tess fails are used to
631  determine if the word is deletable.
632  */
634  word->merge_tess_fails();
635  page_res_it.forward ();
636  }
637 }
638 
639 
// Body of the function that tilde_crunch() calls as
// convert_bad_unlv_chs(word). For each position of the reject map it rewrites
// "~" to "-" and "^" to " " in word_res->best_choice, and marks any such
// position that was accepted as an unlv reject.
// NOTE(review): the signature (listing line 640) is absent from this extract.
641  int i;
642  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
643  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
644  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
645  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
// NOTE(review): the loop bound is reject_map.length() but the index is also
// used on best_choice; this assumes the two have the same length - confirm.
646  for (i = 0; i < word_res->reject_map.length(); ++i) {
647  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
648  word_res->best_choice->set_unichar_id(unichar_dash, i);
649  if (word_res->reject_map[i].accepted ())
650  word_res->reject_map[i].setrej_unlv_rej ();
651  }
652  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
653  word_res->best_choice->set_unichar_id(unichar_space, i);
654  if (word_res->reject_map[i].accepted ())
655  word_res->reject_map[i].setrej_unlv_rej ();
656  }
657  }
658 }
659 
// Grades the word's best-choice string as G_NEVER_CRUNCH, G_OK, G_DODGY or
// G_TERRIBLE. One scan over the unichars gathers run lengths and counts of
// isolated letters/digits, bad characters and tess rejects (single spaces);
// the thresholds after the scan turn those counts into a grade.
// NOTE(review): listing line 801 is absent from this extract; it is the
// condition that opens the block closed on line 803 (the cross-reference
// list at the end of this listing mentions crunch_include_numerals, which is
// a plausible candidate - confirm).
660 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
// Scanner state: what kind of character was seen last, and whether it was
// the first of its run (FIRST_*) or a continuation (SUBSEQUENT_*).
661  enum STATES
662  {
663  JUNK,
664  FIRST_UPPER,
665  FIRST_LOWER,
666  FIRST_NUM,
667  SUBSEQUENT_UPPER,
668  SUBSEQUENT_LOWER,
669  SUBSEQUENT_NUM
670  };
671  const char *str = word->best_choice->unichar_string().c_str();
672  const char *lengths = word->best_choice->unichar_lengths().c_str();
673  STATES state = JUNK;
674  int len = 0;
675  int isolated_digits = 0;
676  int isolated_alphas = 0;
677  int bad_char_count = 0;
678  int tess_rejs = 0;
679  int dodgy_chars = 0;
680  int ok_chars;
681  UNICHAR_ID last_char = -1;
682  int alpha_repetition_count = 0;
683  int longest_alpha_repetition_count = 0;
684  int longest_lower_run_len = 0;
685  int lower_string_count = 0;
686  int longest_upper_run_len = 0;
687  int upper_string_count = 0;
688  int total_alpha_count = 0;
689  int total_digit_count = 0;
690 
// One iteration per unichar: str advances by the byte length stored at
// *lengths, and len counts unichars rather than bytes.
691  for (; *str != '\0'; str += *(lengths++)) {
692  len++;
693  if (word->uch_set->get_isupper (str, *lengths)) {
694  total_alpha_count++;
695  switch (state) {
696  case SUBSEQUENT_UPPER:
697  case FIRST_UPPER:
// Continuing an upper-case run: extend the run length and track how many
// times the same character repeats consecutively.
698  state = SUBSEQUENT_UPPER;
699  upper_string_count++;
700  if (longest_upper_run_len < upper_string_count)
701  longest_upper_run_len = upper_string_count;
702  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
703  alpha_repetition_count++;
704  if (longest_alpha_repetition_count < alpha_repetition_count) {
705  longest_alpha_repetition_count = alpha_repetition_count;
706  }
707  }
708  else {
709  last_char = word->uch_set->unichar_to_id(str, *lengths);
710  alpha_repetition_count = 1;
711  }
712  break;
713  case FIRST_NUM:
// A single digit followed by a letter counts as an isolated digit.
714  isolated_digits++;
715  // Fall through.
716  default:
717  state = FIRST_UPPER;
718  last_char = word->uch_set->unichar_to_id(str, *lengths);
719  alpha_repetition_count = 1;
720  upper_string_count = 1;
721  break;
722  }
723  }
724  else if (word->uch_set->get_islower (str, *lengths)) {
725  total_alpha_count++;
726  switch (state) {
727  case SUBSEQUENT_LOWER:
728  case FIRST_LOWER:
// Same bookkeeping as the upper-case branch, for lower-case runs.
729  state = SUBSEQUENT_LOWER;
730  lower_string_count++;
731  if (longest_lower_run_len < lower_string_count)
732  longest_lower_run_len = lower_string_count;
733  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
734  alpha_repetition_count++;
735  if (longest_alpha_repetition_count < alpha_repetition_count) {
736  longest_alpha_repetition_count = alpha_repetition_count;
737  }
738  }
739  else {
740  last_char = word->uch_set->unichar_to_id(str, *lengths);
741  alpha_repetition_count = 1;
742  }
743  break;
744  case FIRST_NUM:
745  isolated_digits++;
746  // Fall through.
747  default:
748  state = FIRST_LOWER;
749  last_char = word->uch_set->unichar_to_id(str, *lengths);
750  alpha_repetition_count = 1;
751  lower_string_count = 1;
752  break;
753  }
754  }
755  else if (word->uch_set->get_isdigit (str, *lengths)) {
756  total_digit_count++;
757  switch (state) {
// FIRST_NUM deliberately falls into SUBSEQUENT_NUM's break (no marker
// comment in the original).
758  case FIRST_NUM:
759  state = SUBSEQUENT_NUM;
760  case SUBSEQUENT_NUM:
761  break;
762  case FIRST_UPPER:
763  case FIRST_LOWER:
// A single letter followed by a digit counts as an isolated letter.
764  isolated_alphas++;
765  // Fall through.
766  default:
767  state = FIRST_NUM;
768  break;
769  }
770  }
771  else {
// Neither letter nor digit: a one-byte space is counted as a tess reject,
// anything else as a bad character.
772  if (*lengths == 1 && *str == ' ')
773  tess_rejs++;
774  else
775  bad_char_count++;
776  switch (state) {
777  case FIRST_NUM:
778  isolated_digits++;
779  break;
780  case FIRST_UPPER:
781  case FIRST_LOWER:
782  isolated_alphas++;
783  default:
784  break;
785  }
786  state = JUNK;
787  }
788  }
789 
// Account for a run of length one that ends the string.
790  switch (state) {
791  case FIRST_NUM:
792  isolated_digits++;
793  break;
794  case FIRST_UPPER:
795  case FIRST_LOWER:
796  isolated_alphas++;
797  default:
798  break;
799  }
800 
802  total_alpha_count += total_digit_count - isolated_digits;
803  }
804 
// NOTE(review): the scan loop above advanced str and lengths to the end of
// their strings and nothing resets them. Every later use in this function -
// both acceptable_word_string(..., str, lengths) calls and strpbrk(str, " ")
// - therefore appears to receive the terminator rather than the word, making
// the strpbrk test trivially true. Confirm whether fresh pointers to the
// start of the strings were intended.
805  if (crunch_leave_ok_strings && len >= 4 &&
806  2 * (total_alpha_count - isolated_alphas) > len &&
807  longest_alpha_repetition_count < crunch_long_repetitions) {
808  if ((crunch_accept_ok &&
809  acceptable_word_string(*word->uch_set, str, lengths) !=
810  AC_UNACCEPTABLE) ||
811  longest_lower_run_len > crunch_leave_lc_strings ||
812  longest_upper_run_len > crunch_leave_uc_strings)
813  return G_NEVER_CRUNCH;
814  }
// Multi-character words with no space that came from a dictionary/number
// permuter, look acceptable, or are flagged ok_dict_word are graded G_OK.
815  if (word->reject_map.length() > 1 &&
816  strpbrk(str, " ") == nullptr &&
817  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
818  word->best_choice->permuter() == FREQ_DAWG_PERM ||
819  word->best_choice->permuter() == USER_DAWG_PERM ||
820  word->best_choice->permuter() == NUMBER_PERM ||
821  acceptable_word_string(*word->uch_set, str, lengths) !=
822  AC_UNACCEPTABLE || ok_dict_word))
823  return G_OK;
824 
825  ok_chars = len - bad_char_count - isolated_digits -
826  isolated_alphas - tess_rejs;
827 
828  if (crunch_debug > 3) {
829  tprintf("garbage_word: \"%s\"\n",
830  word->best_choice->unichar_string().c_str());
831  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
832  len,
833  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
834  }
// No bad characters or rejects, and either not everything is isolated or
// the word is at most two unichars long: G_OK.
835  if (bad_char_count == 0 &&
836  tess_rejs == 0 &&
837  (len > isolated_digits + isolated_alphas || len <= 2))
838  return G_OK;
839 
// More rejects than good characters, or rejects present and bad+rejected
// characters make up more than half the word: G_TERRIBLE.
840  if (tess_rejs > ok_chars ||
841  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
842  return G_TERRIBLE;
843 
// Remaining words are G_DODGY or G_OK by a weighted count in which each
// reject counts double; isolated letters/digits only count for len > 4.
844  if (len > 4) {
845  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
846  isolated_alphas;
847  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
848  return G_DODGY;
849  else
850  return G_OK;
851  } else {
852  dodgy_chars = 2 * tess_rejs + bad_char_count;
853  if ((len == 4 && dodgy_chars > 2) ||
854  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
855  return G_DODGY;
856  else
857  return G_OK;
858  }
859 }
860 
861 
862 /*************************************************************************
863  * word_deletable()
864  * DELETE WERDS AT ENDS OF ROWS IF
865  * Word is crunched &&
866  * ( string length = 0 OR
867  * > 50% of chars are "|" (before merging) OR
868  * certainty < -10 OR
869  * rating /char > 60 OR
870  * TOP of word is more than 0.5 xht BELOW baseline OR
871  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
872  * length of word < 3xht OR
873  * height of word < 0.7 xht OR
874  * height of word > 3.0 xht OR
875  * >75% of the outline BBs have longest dimension < 0.5xht
876  *************************************************************************/
877 
// Returns the crunch mode to apply to the word (CR_NONE, CR_DELETE or
// CR_LOOSE_SPACE) and writes a numeric reason code to delete_mode (0 = not
// deletable; the other codes are the literals assigned below).
// NOTE(review): the numeric limits quoted in the header comment above do not
// appear in the code, which compares against the crunch_del_* parameters
// instead - treat the header values as historical unless confirmed.
// NOTE(review): listing lines 924 (the condition for mode 9) and 930 (the
// right-hand side of the box.bottom() test for mode 10) are absent from this
// extract.
878 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
879  int word_len = word->reject_map.length ();
880  float rating_per_ch;
881  TBOX box; //BB of word
882 
// Words that have not been crunched are never deletable.
883  if (word->unlv_crunch_mode == CR_NONE) {
884  delete_mode = 0;
885  return CR_NONE;
886  }
887 
888  if (word_len == 0) {
889  delete_mode = 1;
890  return CR_DELETE;
891  }
892 
893  if (word->rebuild_word != nullptr) {
894  // Cube leaves rebuild_word nullptr.
895  box = word->rebuild_word->bounding_box();
896  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
897  delete_mode = 4;
898  return CR_DELETE;
899  }
900 
901  if (noise_outlines(word->rebuild_word)) {
902  delete_mode = 5;
903  return CR_DELETE;
904  }
905  }
906 
// failure_count() counts the spaces in the best-choice string; the word is
// loosened when they exceed two thirds of its length.
907  if ((failure_count (word) * 1.5) > word_len) {
908  delete_mode = 2;
909  return CR_LOOSE_SPACE;
910  }
911 
912  if (word->best_choice->certainty () < crunch_del_cert) {
913  delete_mode = 7;
914  return CR_LOOSE_SPACE;
915  }
916 
// word_len is non-zero here because of the early return above.
917  rating_per_ch = word->best_choice->rating () / word_len;
918 
919  if (rating_per_ch > crunch_del_rating) {
920  delete_mode = 8;
921  return CR_LOOSE_SPACE;
922  }
923 
925  delete_mode = 9;
926  return CR_LOOSE_SPACE;
927  }
928 
// NOTE(review): when rebuild_word is null, box is still the default-
// constructed TBOX, yet bottom(), height() and width() are consulted below -
// confirm that TBOX's default state makes these tests benign.
929  if (box.bottom () >
931  delete_mode = 10;
932  return CR_LOOSE_SPACE;
933  }
934 
935  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
936  delete_mode = 11;
937  return CR_LOOSE_SPACE;
938  }
939 
940  if (box.width () < crunch_del_min_width * kBlnXHeight) {
941  delete_mode = 3;
942  return CR_LOOSE_SPACE;
943  }
944 
945  delete_mode = 0;
946  return CR_NONE;
947 }
948 
949 int16_t Tesseract::failure_count(WERD_RES *word) {
950  const char *str = word->best_choice->unichar_string().c_str();
951  int tess_rejs = 0;
952 
953  for (; *str != '\0'; str++) {
954  if (*str == ' ')
955  tess_rejs++;
956  }
957  return tess_rejs;
958 }
959 
960 
961 bool Tesseract::noise_outlines(TWERD* word) {
962  TBOX box; // BB of outline
963  int16_t outline_count = 0;
964  int16_t small_outline_count = 0;
965  int16_t max_dimension;
966  float small_limit = kBlnXHeight * crunch_small_outlines_size;
967 
968  for (int b = 0; b < word->NumBlobs(); ++b) {
969  TBLOB* blob = word->blobs[b];
970  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
971  outline_count++;
972  box = ol->bounding_box();
973  if (box.height() > box.width())
974  max_dimension = box.height();
975  else
976  max_dimension = box.width();
977  if (max_dimension < small_limit)
978  small_outline_count++;
979  }
980  }
981  return small_outline_count >= outline_count;
982 }
983 
984 } // namespace tesseract
tesseract::Tesseract::tessedit_dont_rowrej_good_wds
bool tessedit_dont_rowrej_good_wds
Definition: tesseractclass.h:917
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:441
G_NEVER_CRUNCH
Definition: docqual.h:29
tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:133
BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:115
tesseract::Tesseract::crunch_terrible_rating
double crunch_terrible_rating
Definition: tesseractclass.h:936
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
tesseract::Tesseract::tessedit_use_reject_spaces
bool tessedit_use_reject_spaces
Definition: tesseractclass.h:900
tesseract::Tesseract::crunch_del_rating
double crunch_del_rating
Definition: tesseractclass.h:942
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::Tesseract::failure_count
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:946
BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:114
REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:432
CR_DELETE
Definition: pageres.h:156
tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd
double tessedit_good_doc_still_rowrej_wd
Definition: tesseractclass.h:923
tesseract::Tesseract::crunch_poor_garbage_cert
double crunch_poor_garbage_cert
Definition: tesseractclass.h:938
tessvars.h
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:138
tesseract::Tesseract::crunch_del_cert
double crunch_del_cert
Definition: tesseractclass.h:943
tesseract::Tesseract::crunch_rating_max
int crunch_rating_max
Definition: tesseractclass.h:950
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
REJMAP::accept_count
int16_t accept_count()
Definition: rejctmap.cpp:278
reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:385
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
tesseract::Tesseract::noise_outlines
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:958
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
G_TERRIBLE
Definition: docqual.h:32
TWERD
Definition: blobs.h:416
TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:453
tesseractclass.h
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:189
TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:398
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:396
tesseract::Tesseract::crunch_terrible_garbage
bool crunch_terrible_garbage
Definition: tesseractclass.h:937
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
CR_NONE
Definition: pageres.h:153
TESSLINE
Definition: blobs.h:201
PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:695
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
TBOX::top
int16_t top() const
Definition: rect.h:57
tesseract::Tesseract::crunch_leave_lc_strings
int crunch_leave_lc_strings
Definition: tesseractclass.h:958
STRING
Definition: strngs.h:45
tesseract::Tesseract::crunch_include_numerals
bool crunch_include_numerals
Definition: tesseractclass.h:956
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
PDBLK::index
int index() const
Definition: pdblock.h:66
WERD_RES
Definition: pageres.h:160
WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1062
PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:742
tesseract::Tesseract::bland_unrej
bool bland_unrej
Definition: tesseractclass.h:928
tesseract::Tesseract::quality_rowrej_pc
double quality_rowrej_pc
Definition: tesseractclass.h:929
TESSLINE::next
TESSLINE * next
Definition: blobs.h:279
tesseract::Tesseract::crunch_leave_uc_strings
int crunch_leave_uc_strings
Definition: tesseractclass.h:960
tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1744
BLOCK_RES
Definition: pageres.h:110
tesseract::Tesseract::crunch_pot_poor_cert
double crunch_pot_poor_cert
Definition: tesseractclass.h:941
tesseract::Tesseract::tessedit_row_rej_good_docs
bool tessedit_row_rej_good_docs
Definition: tesseractclass.h:921
tesseract::Tesseract::tessedit_preserve_min_wd_len
int tessedit_preserve_min_wd_len
Definition: tesseractclass.h:919
TBOX::height
int16_t height() const
Definition: rect.h:107
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
tesseract::Tesseract::crunch_pot_indicators
int crunch_pot_indicators
Definition: tesseractclass.h:951
ROW_RES::char_count
int32_t char_count
Definition: pageres.h:137
tesseract::Tesseract::crunch_early_convert_bad_unlv_chs
bool crunch_early_convert_bad_unlv_chs
Definition: tesseractclass.h:935
tesseract::Tesseract::crunch_small_outlines_size
double crunch_small_outlines_size
Definition: tesseractclass.h:949
tesseract::Tesseract::crunch_del_max_ht
double crunch_del_max_ht
Definition: tesseractclass.h:945
tesseract::Tesseract::crunch_leave_ok_strings
bool crunch_leave_ok_strings
Definition: tesseractclass.h:952
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:76
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:605
WERD::space
uint8_t space()
Definition: werd.h:98
GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:27
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, std::function< void(int)> cb) const
Definition: boxword.cpp:190
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
tesseract::Tesseract::crunch_leave_accept_strings
bool crunch_leave_accept_strings
Definition: tesseractclass.h:955
tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds
bool tessedit_preserve_row_rej_perfect_wds
Definition: tesseractclass.h:913
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
W_EOL
end of line
Definition: werd.h:47
ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:139
tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:95
tesseract::Tesseract::outlines_2
char * outlines_2
Definition: tesseractclass.h:897
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
TBOX::width
int16_t width() const
Definition: rect.h:114
tesseract::Tesseract::crunch_del_low_word
double crunch_del_low_word
Definition: tesseractclass.h:948
PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:77
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::tessedit_whole_wd_rej_row_percent
double tessedit_whole_wd_rej_row_percent
Definition: tesseractclass.h:909
tesseract::Tesseract::crunch_long_repetitions
int crunch_long_repetitions
Definition: tesseractclass.h:961
tesseract::Tesseract::crunch_poor_garbage_rate
double crunch_poor_garbage_rate
Definition: tesseractclass.h:939
tesseract::Tesseract::tessedit_good_quality_unrej
bool tessedit_good_quality_unrej
Definition: tesseractclass.h:899
REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:228
tesseract::Tesseract::tessedit_dont_blkrej_good_wds
bool tessedit_dont_blkrej_good_wds
Definition: tesseractclass.h:915
tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:658
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract
Definition: baseapi.h:65
tesseract::Tesseract::crunch_early_merge_tess_fails
bool crunch_early_merge_tess_fails
Definition: tesseractclass.h:934
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
tesseract::Tesseract::tessedit_debug_doc_rejection
bool tessedit_debug_doc_rejection
Definition: tesseractclass.h:925
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::Tesseract::potential_word_crunch
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:520
tesseract::Tesseract::crunch_del_min_width
double crunch_del_min_width
Definition: tesseractclass.h:946
tesseract::Tesseract::crunch_del_min_ht
double crunch_del_min_ht
Definition: tesseractclass.h:944
PAGE_RES_IT
Definition: pageres.h:668
tesseract::Tesseract::terrible_word_crunch
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:482
reject.h
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
PAGE_RES::rejected
bool rejected
Definition: pageres.h:79
tesseract::Tesseract::outlines_odd
char * outlines_odd
Definition: tesseractclass.h:896
STRING::length
int32_t length() const
Definition: strngs.cpp:187
REJMAP::rej_word_doc_rej
void rej_word_doc_rej()
Definition: rejctmap.cpp:423
tesseract::Tesseract::count_outline_errs
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:121
TBLOB
Definition: blobs.h:282
ROW_RES
Definition: pageres.h:133
BLOCK_RES::block
BLOCK * block
Definition: pageres.h:113
tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:875
tesseract::Tesseract::tessedit_reject_row_percent
double tessedit_reject_row_percent
Definition: tesseractclass.h:906
tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:638
tesseract::Tesseract::tessedit_reject_doc_percent
double tessedit_reject_doc_percent
Definition: tesseractclass.h:902
tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:568
tesseract::Tesseract::crunch_del_high_word
double crunch_del_high_word
Definition: tesseractclass.h:947
STRING::unsigned_size
uint32_t unsigned_size() const
Definition: strngs.h:72
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:150
POLY_BLOCK
Definition: polyblk.h:26
WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:335
tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:112
REJMAP::quality_recoverable_rejects
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:299
tesseract::Tesseract::tessedit_unrej_any_wd
bool tessedit_unrej_any_wd
Definition: tesseractclass.h:841
CR_LOOSE_SPACE
Definition: pageres.h:155
WERD_RES::word
WERD * word
Definition: pageres.h:180
TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:859
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:225
G_OK
Definition: docqual.h:30
tesseract::Tesseract::unlv_tilde_crunching
bool unlv_tilde_crunching
Definition: tesseractclass.h:930
tesseract::Tesseract::tessedit_reject_block_percent
double tessedit_reject_block_percent
Definition: tesseractclass.h:904
tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:154
tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds
bool tessedit_preserve_blk_rej_perfect_wds
Definition: tesseractclass.h:911
G_DODGY
Definition: docqual.h:31
tesseract::Tesseract::tessedit_debug_block_rejection
bool tessedit_debug_block_rejection
Definition: tesseractclass.h:846
CR_KEEP_SPACE
Definition: pageres.h:154
tesseract::Tesseract::crunch_debug
int crunch_debug
Definition: tesseractclass.h:962
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Tesseract::crunch_accept_ok
bool crunch_accept_ok
Definition: tesseractclass.h:953
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:64
W_BOL
start of line
Definition: werd.h:46
NUMBER_PERM
Definition: ratngs.h:237
tesseract::Tesseract::crunch_pot_poor_rate
double crunch_pot_poor_rate
Definition: tesseractclass.h:940
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:76
tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1848
TBOX
Definition: rect.h:33
docqual.h