tesseract  4.0.0-1-g2a2b
output.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: output.cpp (Formerly output.c)
3  * Description: Output pass
4  * Author: Phil Cheatle
5  * Created: Thu Aug 4 10:56:08 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <cctype>
21 #include <cerrno>
22 #include <cstring>
23 #include "helpers.h"
24 #include "tessvars.h"
25 #include "control.h"
26 #include "reject.h"
27 #include "docqual.h"
28 #include "output.h"
29 #include "globals.h"
30 #include "tesseractclass.h"
31 
32 #define EPAPER_EXT ".ep"
33 #define PAGE_YSIZE 3508
34 #define CTRL_INSET '\024' //dc4=text inset
35 #define CTRL_FONT '\016' //so=font change
36 #define CTRL_DEFAULT '\017' //si=default font
37 #define CTRL_SHIFT '\022' //dc2=x shift
38 #define CTRL_TAB '\011' //tab
39 #define CTRL_NEWLINE '\012' //newline
40 #define CTRL_HARDLINE '\015' //cr
41 
42 namespace tesseract {
// Walks every word reachable through page_res_it, starting again from the
// beginning of the page, and hands each one to write_results() together with
// the newline type computed by determine_newline_type() and a force_eol flag.
//
// page_res_it      iterator over the page results; restarted here and advanced
//                  with forward() until word() returns nullptr.
// target_word_box  optional filter; when non-null, a word is skipped if the
//                  centre point of its bounding box is not contained in it.
//
// NOTE(review): the embedded line numbering jumps from 65 to 67 below, so the
// opening line of one `if` condition is missing from this copy of the file.
43 void Tesseract::output_pass( //Tess output pass //send to api
44  PAGE_RES_IT &page_res_it,
45  const TBOX *target_word_box) {
46  BLOCK_RES *block_of_last_word;
47  bool force_eol; //During output
48  BLOCK *nextblock; //block of next word
49  WERD *nextword; //next word
50 
51  page_res_it.restart_page ();
52  block_of_last_word = nullptr;
53  while (page_res_it.word () != nullptr) {
54  check_debug_pt (page_res_it.word (), 120);
55 
  // Optional spatial filter: only words whose box centre lies inside
  // target_word_box are processed; the others are stepped over.
56  if (target_word_box) {
57  TBOX current_word_box = page_res_it.word()->word->bounding_box();
58  FCOORD center_pt(
59  (current_word_box.right() + current_word_box.left()) / 2,
60  (current_word_box.bottom() + current_word_box.top()) / 2);
61  if (!target_word_box->contains(center_pt)) {
62  page_res_it.forward();
63  continue;
64  }
65  }
  // NOTE(review): the first line of this condition (embedded line 66) is not
  // present in this copy. The visible part records the current block in
  // block_of_last_word whenever it differs from the previously recorded one.
67  block_of_last_word != page_res_it.block ()) {
68  block_of_last_word = page_res_it.block ();
69  }
70 
  // force_eol is set when block separators are enabled and the next word
  // belongs to a different block, or when there is no next word at all.
71  force_eol = (tessedit_write_block_separators &&
72  (page_res_it.block () != page_res_it.next_block ())) ||
73  (page_res_it.next_word () == nullptr);
74 
  // Fetch the WERD / BLOCK behind the next result entries, or nullptr when
  // the iterator reports none.
75  if (page_res_it.next_word () != nullptr)
76  nextword = page_res_it.next_word ()->word;
77  else
78  nextword = nullptr;
79  if (page_res_it.next_block () != nullptr)
80  nextblock = page_res_it.next_block ()->block;
81  else
82  nextblock = nullptr;
83  //regardless of tilde crunching
84  write_results(page_res_it,
85  determine_newline_type(page_res_it.word()->word,
86  page_res_it.block()->block,
87  nextword, nextblock), force_eol);
88  page_res_it.forward();
89  }
90 }
91 
92 
93 /*************************************************************************
94  * write_results()
95  *
96  * All recognition and rejection has now been done. Generate the following:
97  * .txt file - giving the final best choices with NO highlighting
98  * .raw file - giving the tesseract top choice output for each word
99  * .map file - showing how the .txt file has been rejected in the .ep file
100  * epchoice list - a list of one element per word, containing the text for the
101  * epaper. Reject strings are inserted.
102  * inset list - a list of bounding boxes of reject insets - indexed by the
103  * reject strings in the epchoice text.
104  *************************************************************************/
106  char newline_type, // type of newline
107  bool force_eol) { // override tilde crunch?
  // Updates the stats_ bookkeeping flags (last_char_was_tilde,
  // last_char_was_newline, tilde_crunch_written, write_results_empty_block)
  // for the current word of page_res_it, then relaxes entries of the word's
  // reject_map.
  //
  // NOTE(review): the embedded line numbering shows that the signature line
  // (105) and lines 116, 163, 170, 188, 194 and 201 are missing from this copy,
  // so several conditions below are incomplete as shown.
108  WERD_RES *word = page_res_it.word();
109  const UNICHARSET &uchset = *word->uch_set;
110  int i;
111  bool need_reject = false;
  // Id of the " " unichar; the code below compares characters of best_choice
  // against it.
112  UNICHAR_ID space = uchset.unichar_to_id(" ");
113 
  // First path: the word has a crunch mode other than CR_NONE or an empty
  // best choice (the remaining conjuncts of this condition are on the missing
  // line 116). This path only adjusts stats_ flags and then returns.
114  if ((word->unlv_crunch_mode != CR_NONE ||
115  word->best_choice->length() == 0) &&
117  if ((word->unlv_crunch_mode != CR_DELETE) &&
118  (!stats_.tilde_crunch_written ||
119  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
120  (word->word->space () > 0) &&
121  !word->word->flag (W_FUZZY_NON) &&
122  !word->word->flag (W_FUZZY_SP)))) {
123  if (!word->word->flag (W_BOL) &&
124  (word->word->space () > 0) &&
125  !word->word->flag (W_FUZZY_NON) &&
126  !word->word->flag (W_FUZZY_SP)) {
127  stats_.last_char_was_tilde = false;
128  }
129  need_reject = true;
130  }
131  if ((need_reject && !stats_.last_char_was_tilde) ||
132  (force_eol && stats_.write_results_empty_block)) {
  // In the visible code nothing is emitted here; only the flags are updated.
133  /* Write a reject char - mark as rejected unless zero_rejection mode */
134  stats_.last_char_was_tilde = TRUE;
135  stats_.tilde_crunch_written = true;
136  stats_.last_char_was_newline = false;
137  stats_.write_results_empty_block = false;
138  }
139 
140  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
141  stats_.tilde_crunch_written = false;
142  stats_.last_char_was_newline = true;
143  stats_.last_char_was_tilde = false;
144  }
145 
146  if (force_eol)
147  stats_.write_results_empty_block = true;
148  return;
149  }
150 
151  /* NORMAL PROCESSING of non tilde crunched words */
152 
153  stats_.tilde_crunch_written = false;
154  if (newline_type)
155  stats_.last_char_was_newline = true;
156  else
157  stats_.last_char_was_newline = false;
158  stats_.write_results_empty_block = force_eol; // about to write a real word
159 
  // NOTE(review): one conjunct of this condition (embedded line 163) is
  // missing from this copy. unichar_id(0) is read here without a visible
  // length check in this condition.
160  if (unlv_tilde_crunching &&
161  stats_.last_char_was_tilde &&
162  (word->word->space() == 0) &&
164  (word->best_choice->unichar_id(0) == space)) {
165  /* Prevent adjacent tilde across words - we know that adjacent tildes within
166  words have been removed */
167  word->MergeAdjacentBlobs(0);
168  }
  // NOTE(review): the second operand of this `||` (embedded line 170) is
  // missing from this copy.
169  if (newline_type ||
171  stats_.last_char_was_tilde = false;
172  else {
173  if (word->reject_map.length () > 0) {
  // Indexes best_choice with the reject_map length; the equality of the two
  // lengths is only asserted further down.
174  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
175  stats_.last_char_was_tilde = true;
176  else
177  stats_.last_char_was_tilde = false;
178  }
179  else if (word->word->space () > 0)
180  stats_.last_char_was_tilde = false;
181  /* else it is unchanged as there are no output chars */
182  }
183 
184  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
185 
186  set_unlv_suspects(word);
187  check_debug_pt (word, 120);
  // NOTE(review): the line that opens the block around this tprintf (embedded
  // line 188) is missing from this copy.
189  tprintf ("Dict word: \"%s\": %d\n",
190  word->best_choice->debug_string().string(),
191  dict_word(*(word->best_choice)));
192  }
193  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
  // NOTE(review): the guards that open the two loops below (embedded lines 194
  // and 201) are missing from this copy. The first loop accepts every rejected
  // character; the second accepts rejected characters other than `space`.
195  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196  for (i = 0; i < word->best_choice->length(); ++i) {
197  if (word->reject_map[i].rejected())
198  word->reject_map[i].setrej_minimal_rej_accept();
199  }
200  }
202  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
203  for (i = 0; i < word->best_choice->length(); ++i) {
204  if ((word->best_choice->unichar_id(i) != space) &&
205  word->reject_map[i].rejected())
206  word->reject_map[i].setrej_minimal_rej_accept();
207  }
208  }
209  }
210 }
211 } // namespace tesseract
212 
213 /**********************************************************************
214  * determine_newline_type
215  *
216  * Find whether we have a wrapping or hard newline.
217  * Return FALSE if not at end of line.
218  **********************************************************************/
219 
220 char determine_newline_type( //test line ends
221  WERD *word, //word to do
222  BLOCK *block, //current block
223  WERD *next_word, //next word
224  BLOCK *next_block //block of next word
225  ) {
226  int16_t end_gap; //to right edge
227  int16_t width; //of next word
228  TBOX word_box; //bounding
229  TBOX next_box; //next word
230  TBOX block_box; //block bounding
231 
232  if (!word->flag (W_EOL))
233  return FALSE; //not end of line
234  if (next_word == nullptr || next_block == nullptr || block != next_block)
235  return CTRL_NEWLINE;
236  if (next_word->space () > 0)
237  return CTRL_HARDLINE; //it is tabbed
238  word_box = word->bounding_box ();
239  next_box = next_word->bounding_box ();
240  block_box = block->pdblk.bounding_box ();
241  //gap to eol
242  end_gap = block_box.right () - word_box.right ();
243  end_gap -= (int32_t) block->space ();
244  width = next_box.right () - next_box.left ();
245  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
246  // block_box.right(),word_box.right(),end_gap,
247  // next_box.right(),next_box.left(),width,
248  // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
249  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
250 }
251 
252 /*************************************************************************
253  * get_rep_char()
254  * Return the first accepted character from the repetition string. This is the
255  * character which is repeated - as determined earlier by fix_rep_char()
256  *************************************************************************/
257 namespace tesseract {
258 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
259  int i;
260  for (i = 0; ((i < word->reject_map.length()) &&
261  (word->reject_map[i].rejected())); ++i);
262 
263  if (i < word->reject_map.length()) {
264  return word->best_choice->unichar_id(i);
265  } else {
266  return word->uch_set->unichar_to_id(unrecognised_char.string());
267  }
268 }
269 
270 /*************************************************************************
271  * SUSPECT LEVELS
272  *
273  * 0 - don't reject ANYTHING
274  * 1,2 - partial rejection
275  * 3 - BEST
276  *
277  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
278  * tessedit_minimal_rejection.
279  *************************************************************************/
  // Selectively clears rejection marks in word_res->reject_map by calling
  // setrej_minimal_rej_accept() on rejected entries, depending on
  // suspect_level:
  //   0    - every rejected entry is accepted;
  //   >= 3 - nothing is changed;
  //   1, 2 - the staged rules below apply (level 2 stops earlier than level 1).
  //
  // NOTE(review): the signature line (embedded line 280) and line 357 are
  // missing from this copy of the file.
281  int len = word_res->reject_map.length();
282  const WERD_CHOICE &word = *(word_res->best_choice);
283  const UNICHARSET &uchset = *word.unicharset();
284  int i;
285  float rating_per_ch;
286 
287  if (suspect_level == 0) {
288  for (i = 0; i < len; i++) {
289  if (word_res->reject_map[i].rejected())
290  word_res->reject_map[i].setrej_minimal_rej_accept();
291  }
292  return;
293  }
294 
295  if (suspect_level >= 3)
296  return; //Use defaults
297 
298  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
299 
300  if (safe_dict_word(word_res) &&
301  (count_alphas(word) > suspect_short_words)) {
302  /* Unreject alphas in dictionary words */
303  for (i = 0; i < len; ++i) {
304  if (word_res->reject_map[i].rejected() &&
305  uchset.get_isalpha(word.unichar_id(i)))
306  word_res->reject_map[i].setrej_minimal_rej_accept();
307  }
308  }
309 
  // NOTE(review): there is no guard for an empty reject_map here; with length
  // 0 this is a floating-point division by zero - confirm callers never pass
  // an empty word.
310  rating_per_ch = word.rating() / word_res->reject_map.length();
311 
312  if (rating_per_ch >= suspect_rating_per_ch)
313  return; // Don't touch bad ratings
314 
315  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
316  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
317  for (i = 0; i < len; ++i) {
318  if (word_res->reject_map[i].rejected() &&
319  (!uchset.eq(word.unichar_id(i), " ")))
320  word_res->reject_map[i].setrej_minimal_rej_accept();
321  }
322  }
323 
  // Entries rejected at document, block or row level are accepted; all three
  // flags lead to the same call.
324  for (i = 0; i < len; i++) {
325  if (word_res->reject_map[i].rejected()) {
326  if (word_res->reject_map[i].flag(R_DOC_REJ))
327  word_res->reject_map[i].setrej_minimal_rej_accept();
328  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
329  word_res->reject_map[i].setrej_minimal_rej_accept();
330  if (word_res->reject_map[i].flag(R_ROW_REJ))
331  word_res->reject_map[i].setrej_minimal_rej_accept();
332  }
333  }
334 
  // Level 2 stops here; the remaining rules run only for level 1.
335  if (suspect_level == 2)
336  return;
337 
338  if (!suspect_constrain_1Il ||
339  (word_res->reject_map.length() <= suspect_short_words)) {
340  for (i = 0; i < len; i++) {
341  if (word_res->reject_map[i].rejected()) {
342  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
343  word_res->reject_map[i].flag(R_POSTNN_1IL)))
344  word_res->reject_map[i].setrej_minimal_rej_accept();
345 
346  if (!suspect_constrain_1Il &&
347  word_res->reject_map[i].flag(R_MM_REJECT))
348  word_res->reject_map[i].setrej_minimal_rej_accept();
349  }
350  }
351  }
352 
  // NOTE(review): the start of the second operand of this `||` (embedded line
  // 357) is missing from this copy; only its last argument is visible.
353  if (acceptable_word_string(*word_res->uch_set,
354  word.unichar_string().string(),
355  word.unichar_lengths().string()) !=
356  AC_UNACCEPTABLE ||
358  word.unichar_lengths().string())) {
359  if (word_res->reject_map.length() > suspect_short_words) {
360  for (i = 0; i < len; i++) {
361  if (word_res->reject_map[i].rejected() &&
362  (!word_res->reject_map[i].perm_rejected() ||
363  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
364  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
365  word_res->reject_map[i].flag (R_MM_REJECT))) {
366  word_res->reject_map[i].setrej_minimal_rej_accept();
367  }
368  }
369  }
370  }
371 }
372 
373 int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
374  int count = 0;
375  for (int i = 0; i < word.length(); ++i) {
376  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
377  count++;
378  }
379  return count;
380 }
381 
382 
  // Counts the positions of `word` whose unichar is reported as alphabetic or
  // as a digit by the word's own unicharset, and returns that count.
  // NOTE(review): the signature line (embedded line 383) is missing from this
  // copy of the file.
384  int count = 0;
385  for (int i = 0; i < word.length(); ++i) {
386  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
387  word.unicharset()->get_isdigit(word.unichar_id(i)))
388  count++;
389  }
390  return count;
391 }
392 
393 
395  const char* lengths) {
  // Decides whether the string `s` looks like a number. `s` is walked one
  // character at a time, advancing by the matching entry of `lengths` at each
  // step. Accepted shape, as implemented below: an optional leading '(', an
  // optional single '$', '.', '+' or '-', then digits in which a single '.',
  // ',' or '-' may follow a digit, optionally finishing with '%', ')' or "%)".
  // Returns true when the whole string fits that shape, false at the first
  // character that does not.
  //
  // NOTE(review): the first line of the signature (embedded line 394) is
  // missing from this copy of the file.
396  bool prev_digit = false;
397 
  // NOTE(review): both prefix checks advance `s` but leave `lengths` where it
  // is, so the loop below starts with `lengths` still pointing at the entry
  // for a skipped prefix character - confirm this is intended.
398  if (*lengths == 1 && *s == '(')
399  s++;
400 
401  if (*lengths == 1 &&
402  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
403  s++;
404 
405  for (; *s != '\0'; s += *(lengths++)) {
406  if (unicharset.get_isdigit(s, *lengths))
407  prev_digit = true;
  // A separator is only accepted directly after a digit; it clears prev_digit,
  // so two separators in a row fall through to the final `return false`.
408  else if (prev_digit &&
409  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
410  prev_digit = false;
  // A '%' or ')' that is the last character, directly after a digit.
411  else if (prev_digit && *lengths == 1 &&
412  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
413  return true;
  // "%)" as the last two characters, directly after a digit.
414  else if (prev_digit &&
415  *lengths == 1 && (*s == '%') &&
416  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
417  (*(s + *lengths + *(lengths + 1)) == '\0'))
418  return true;
419  else
420  return false;
421  }
  // Reached when the string ends on a digit or on a single separator that
  // followed a digit, and also when nothing remains after the prefix.
422  return true;
423 }
424 } // namespace tesseract
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:280
int16_t space() const
return spacing
Definition: ocrblock.h:100
BLOCK_RES * block() const
Definition: pageres.h:757
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:394
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:980
int UNICHAR_ID
Definition: unichar.h:35
#define TRUE
Definition: capi.h:51
#define CTRL_NEWLINE
Definition: output.cpp:39
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:129
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
REJMAP reject_map
Definition: pageres.h:287
const char * string() const
Definition: strngs.cpp:196
int count(LIST var_list)
Definition: oldlist.cpp:98
TBOX bounding_box() const
Definition: werd.cpp:159
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
Definition: rect.h:34
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:35
BLOCK_RES * next_block() const
Definition: pageres.h:766
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:373
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
float rating() const
Definition: ratngs.h:327
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
uint8_t space()
Definition: werd.h:102
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:105
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:220
WERD_RES * restart_page()
Definition: pageres.h:698
BLOCK * block
Definition: pageres.h:117
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int16_t top() const
Definition: rect.h:58
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:310
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
UNICHARSET unicharset
Definition: ccutil.h:68
WERD_RES * next_word() const
Definition: pageres.h:760
#define FALSE
Definition: capi.h:52
bool tess_accepted
Definition: pageres.h:296
#define CTRL_HARDLINE
Definition: output.cpp:40
WERD_RES * word() const
Definition: pageres.h:751
Definition: werd.h:59
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: werd.h:34
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:258
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
Definition: ocrblock.h:30
int length() const
Definition: ratngs.h:303
const STRING debug_string() const
Definition: ratngs.h:505
bool contains(const FCOORD pt) const
Definition: rect.h:333
const UNICHARSET * uch_set
Definition: pageres.h:206
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
Definition: points.h:189
const STRING & unichar_string() const
Definition: ratngs.h:541
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
int16_t right() const
Definition: rect.h:79
WERD_RES * forward()
Definition: pageres.h:731
int16_t bottom() const
Definition: rect.h:65
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:383
Unacceptable word.
Definition: control.h:30
PDBLK pdblk
Definition: ocrblock.h:192
WERD_CHOICE * best_choice
Definition: pageres.h:235
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:43
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189