tesseract  5.0.0-alpha-619-ge9db
ocrclass.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ocrclass.h
3  * Description: Class definitions and constants for the OCR API.
4  * Author: Hewlett-Packard Co
5  *
6  * (C) Copyright 1996, Hewlett-Packard Co.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 /**********************************************************************
20  * This file contains typedefs for all the structures used by
21  * the HP OCR interface.
22  * The structures are designed to allow them to be used with any
23  * structure alignment up to 8.
24  **********************************************************************/
25 
26 #ifndef CCUTIL_OCRCLASS_H_
27 #define CCUTIL_OCRCLASS_H_
28 
29 #include <chrono>
30 #include <ctime>
31 
32 /**********************************************************************
33  * EANYCODE_CHAR
34  * Description of a single character. The character code is defined by
35  * the character set of the current font.
36  * Output text is sent as an array of these structures.
37  * Spaces and line endings in the output are represented in the
38  * structures of the surrounding characters. They are not directly
39  * represented as characters.
40  * The first character in a word has a positive value of blanks.
41  * Missing information should be set to the defaults in the comments.
42  * If word bounds are known, but not character bounds, then the top and
43  * bottom of each character should be those of the word. The left of the
44  * first and right of the last char in each word should be set. All other
45  * lefts and rights should be set to -1.
46  * If set, the values of right and bottom are left+width and top+height.
47  * Most of the members come directly from the parameters to ocr_append_char.
48  * The formatting member uses the enhancement parameter and combines the
49  * line direction stuff into the top 3 bits.
50  * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
51  * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
52  * the coding is, only that it is backwards compatible with the previous
53  * version.
54  **********************************************************************/
55 
/// Description of a single output character.
///
/// From version 2.0 onwards char_code carries UTF-8. An ASCII character is
/// returned as one structure; any other character is returned as two or more
/// consecutive structures, each holding a single byte of its UTF-8 encoding
/// and all sharing the same bounding box. Programs that handle languages with
/// different character sets must process such extended characters
/// appropriately, and *all* callers must be prepared to receive UTF-8 coded
/// characters such as bullets and fancy quotes.
///
/// The value in parentheses after each member is the default to store when
/// the information is not available. The struct has no default member
/// initializers, so it remains an aggregate and callers fill it in.
struct EANYCODE_CHAR {
  uint16_t char_code;  ///< character itself
  int16_t left;        ///< left edge of char (-1)
  int16_t right;       ///< right edge of char (-1)
  int16_t top;         ///< top edge of char (-1)
  int16_t bottom;      ///< bottom edge of char (-1)
  int16_t font_index;  ///< what font (0)
  uint8_t confidence;  ///< 0=perfect, 100=reject (0/100)
  uint8_t point_size;  ///< size of char, 72 = 1 inch (10)
  int8_t blanks;       ///< number of spaces before this char (1)
  uint8_t formatting;  ///< char formatting (0)
};
76 
77 /**********************************************************************
78  * ETEXT_DESC
79  * Description of the output of the OCR engine.
80  * This structure is used as both a progress monitor and the final
81  * output header, since it needs to be a valid progress monitor while
82  * the OCR engine is storing its output to shared memory.
83  * During progress, all the buffer info is -1.
84  * Progress starts at 0 and increases to 100 during OCR. No other constraint.
85  * Additionally the progress callback contains the bounding box of the word that
86  * is currently being processed.
87  * Every progress callback, the OCR engine must set ocr_alive to 1.
88  * The HP side will set ocr_alive to 0. Repeated failure to reset
89  * to 1 indicates that the OCR engine is dead.
90  * If the cancel function is not null then it is called with the number of
91  * user words found. If it returns true then operation is cancelled.
92  **********************************************************************/
// Forward declaration: PROGRESS_FUNC2 takes a pointer to the monitor.
class ETEXT_DESC;

/// Cancellation hook. When non-null it is called with the number of user
/// words found (the int argument); returning true cancels the operation.
/// NOTE(review): the void* argument is presumably ETEXT_DESC::cancel_this -
/// confirm against the call site.
using CANCEL_FUNC = bool (*)(void*, int);

/// Legacy progress hook, called as (progress, left, right, top, bottom).
using PROGRESS_FUNC = bool (*)(int, int, int, int, int);

/// Monitor-aware progress hook, called as (monitor, left, right, top, bottom).
using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC*, int, int, int, int);
98 
99 class ETEXT_DESC { // output header
100  public:
101  int16_t count{0};
102  int16_t progress{0};
103 
106  int8_t more_to_come{0};
107  volatile int8_t ocr_alive{0};
108  int8_t err_code{0};
109  CANCEL_FUNC cancel{nullptr};
111  nullptr};
113  void* cancel_this{nullptr};
114  std::chrono::steady_clock::time_point end_time;
117  EANYCODE_CHAR text[1]{};
118 
119  ETEXT_DESC() : progress_callback2(&default_progress_func) {
120  end_time = std::chrono::time_point<std::chrono::steady_clock,
121  std::chrono::milliseconds>();
122  }
123 
124  // Sets the end time to be deadline_msecs milliseconds from now.
125  void set_deadline_msecs(int32_t deadline_msecs) {
126  if (deadline_msecs > 0) {
127  end_time = std::chrono::steady_clock::now() +
128  std::chrono::milliseconds(deadline_msecs);
129  }
130  }
131 
132  // Returns false if we've not passed the end_time, or have not set a deadline.
133  bool deadline_exceeded() const {
134  if (end_time.time_since_epoch() ==
135  std::chrono::steady_clock::duration::zero()) {
136  return false;
137  }
138  auto now = std::chrono::steady_clock::now();
139  return (now > end_time);
140  }
141 
142  private:
143  static bool default_progress_func(ETEXT_DESC* ths, int left, int right,
144  int top, int bottom) {
145  if (ths->progress_callback != nullptr) {
146  return (*(ths->progress_callback))(ths->progress, left, right, top,
147  bottom);
148  }
149  return true;
150  }
151 };
152 
153 #endif // CCUTIL_OCRCLASS_H_
ETEXT_DESC::set_deadline_msecs
void set_deadline_msecs(int32_t deadline_msecs)
Definition: ocrclass.h:121
ETEXT_DESC::progress_callback2
PROGRESS_FUNC2 progress_callback2
monitor-aware progress callback
Definition: ocrclass.h:108
PROGRESS_FUNC
bool(*)(int, int, int, int, int) PROGRESS_FUNC
Definition: ocrclass.h:92
ETEXT_DESC::more_to_come
int8_t more_to_come
true if not last
Definition: ocrclass.h:102
ETEXT_DESC
Definition: ocrclass.h:95
ETEXT_DESC::count
int16_t count
chars in this buffer(0)
Definition: ocrclass.h:97
ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:103
ETEXT_DESC::cancel_this
void * cancel_this
this or other data for cancel
Definition: ocrclass.h:109
ETEXT_DESC::progress
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:98
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
ETEXT_DESC::err_code
int8_t err_code
for errcode use
Definition: ocrclass.h:104
ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:129
ETEXT_DESC::ETEXT_DESC
ETEXT_DESC()
default constructor; installs default_progress_func as progress_callback2 and clears end_time
Definition: ocrclass.h:115
ETEXT_DESC::progress_callback
PROGRESS_FUNC progress_callback
called whenever progress increases
Definition: ocrclass.h:106
CANCEL_FUNC
bool(*)(void *, int) CANCEL_FUNC
Definition: ocrclass.h:91
ETEXT_DESC::end_time
std::chrono::steady_clock::time_point end_time
time to stop; set by set_deadline_msecs() and checked by deadline_exceeded()
Definition: ocrclass.h:110
ETEXT_DESC::cancel
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:105
ETEXT_DESC::text
EANYCODE_CHAR text[1]
character data
Definition: ocrclass.h:113
EANYCODE_CHAR
Definition: ocrclass.h:53
PROGRESS_FUNC2
bool(*)(ETEXT_DESC *, int, int, int, int) PROGRESS_FUNC2
Definition: ocrclass.h:93