tesseract  5.0.0-alpha-619-ge9db
hocrrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: hocrrenderer.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith (original code from baseapi.cpp)
5  * Author: Stefan Weil (moved to separate file and cleaned code)
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <locale> // for std::locale::classic
21 #include <memory> // for std::unique_ptr
22 #include <sstream> // for std::stringstream
23 #include <tesseract/baseapi.h> // for TessBaseAPI
24 #ifdef _WIN32
25 # include "host.h" // windows.h for MultiByteToWideChar, ...
26 #endif
27 #include <tesseract/renderer.h>
28 #include "tesseractclass.h" // for Tesseract
29 
30 namespace tesseract {
31 
35 static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
36  tesseract::Orientation orientation;
37  tesseract::WritingDirection writing_direction;
38  tesseract::TextlineOrder textline_order;
39  float deskew_angle;
40  it->Orientation(&orientation, &writing_direction, &textline_order,
41  &deskew_angle);
42  return orientation;
43 }
44 
53 static void AddBaselineCoordsTohOCR(const PageIterator* it,
54  PageIteratorLevel level,
55  std::stringstream& hocr_str) {
56  tesseract::Orientation orientation = GetBlockTextOrientation(it);
57  if (orientation != ORIENTATION_PAGE_UP) {
58  hocr_str << "; textangle " << 360 - orientation * 90;
59  return;
60  }
61 
62  int left, top, right, bottom;
63  it->BoundingBox(level, &left, &top, &right, &bottom);
64 
65  // Try to get the baseline coordinates at this level.
66  int x1, y1, x2, y2;
67  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return;
68  // Following the description of this field of the hOCR spec, we convert the
69  // baseline coordinates so that "the bottom left of the bounding box is the
70  // origin".
71  x1 -= left;
72  x2 -= left;
73  y1 -= bottom;
74  y2 -= bottom;
75 
76  // Now fit a line through the points so we can extract coefficients for the
77  // equation: y = p1 x + p0
78  if (x1 == x2) {
79  // Problem computing the polynomial coefficients.
80  return;
81  }
82  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
83  double p0 = y1 - p1 * x1;
84 
85  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
86  << round(p0 * 1000.0) / 1000.0;
87 }
88 
89 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
90  std::stringstream& hocr_str) {
91  int left, top, right, bottom;
92  it->BoundingBox(level, &left, &top, &right, &bottom);
93  // This is the only place we use double quotes instead of single quotes,
94  // but it may too late to change for consistency
95  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
96  << bottom;
97  // Add baseline coordinates & heights for textlines only.
98  if (level == RIL_TEXTLINE) {
99  AddBaselineCoordsTohOCR(it, level, hocr_str);
100  // add custom height measures
101  float row_height, descenders, ascenders; // row attributes
102  it->RowAttributes(&row_height, &descenders, &ascenders);
103  // TODO(rays): Do we want to limit these to a single decimal place?
104  hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
105  << "; x_ascenders " << ascenders;
106  }
107  hocr_str << "\">";
108 }
109 
119 char* TessBaseAPI::GetHOCRText(int page_number) {
120  return GetHOCRText(nullptr, page_number);
121 }
122 
132 char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
133  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
134  return nullptr;
135 
136  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
137  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
138  bool para_is_ltr = true; // Default direction is LTR
139  const char* paragraph_lang = nullptr;
140  bool font_info = false;
141  bool hocr_boxes = false;
142  GetBoolVariable("hocr_font_info", &font_info);
143  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
144 
145  if (input_file_ == nullptr) SetInputName(nullptr);
146 
147 #ifdef _WIN32
148  // convert input name from ANSI encoding to utf-8
149  int str16_len =
150  MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1, nullptr, 0);
151  wchar_t* uni16_str = new WCHAR[str16_len];
152  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1,
153  uni16_str, str16_len);
154  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
155  0, nullptr, nullptr);
156  char* utf8_str = new char[utf8_len];
157  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
158  nullptr, nullptr);
159  *input_file_ = utf8_str;
160  delete[] uni16_str;
161  delete[] utf8_str;
162 #endif
163 
164  std::stringstream hocr_str;
165  // Use "C" locale (needed for double values x_size and x_descenders).
166  hocr_str.imbue(std::locale::classic());
167  // Use 8 digits for double values.
168  hocr_str.precision(8);
169  hocr_str << " <div class='ocr_page'";
170  hocr_str << " id='"
171  << "page_" << page_id << "'";
172  hocr_str << " title='image \"";
173  if (input_file_) {
174  hocr_str << HOcrEscape(input_file_->c_str()).c_str();
175  } else {
176  hocr_str << "unknown";
177  }
178  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
179  << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
180  << "'>\n";
181 
182  std::unique_ptr<ResultIterator> res_it(GetIterator());
183  while (!res_it->Empty(RIL_BLOCK)) {
184  if (res_it->Empty(RIL_WORD)) {
185  res_it->Next(RIL_WORD);
186  continue;
187  }
188 
189  // Open any new block/paragraph/textline.
190  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
191  para_is_ltr = true; // reset to default direction
192  hocr_str << " <div class='ocr_carea'"
193  << " id='"
194  << "block_" << page_id << "_" << bcnt << "'";
195  AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
196  }
197  if (res_it->IsAtBeginningOf(RIL_PARA)) {
198  hocr_str << "\n <p class='ocr_par'";
199  para_is_ltr = res_it->ParagraphIsLtr();
200  if (!para_is_ltr) {
201  hocr_str << " dir='rtl'";
202  }
203  hocr_str << " id='"
204  << "par_" << page_id << "_" << pcnt << "'";
205  paragraph_lang = res_it->WordRecognitionLanguage();
206  if (paragraph_lang) {
207  hocr_str << " lang='" << paragraph_lang << "'";
208  }
209  AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
210  }
211  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
212  hocr_str << "\n <span class='";
213  switch (res_it->BlockType()) {
214  case PT_HEADING_TEXT:
215  hocr_str << "ocr_header";
216  break;
217  case PT_PULLOUT_TEXT:
218  hocr_str << "ocr_textfloat";
219  break;
220  case PT_CAPTION_TEXT:
221  hocr_str << "ocr_caption";
222  break;
223  default:
224  hocr_str << "ocr_line";
225  }
226  hocr_str << "' id='"
227  << "line_" << page_id << "_" << lcnt << "'";
228  AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
229  }
230 
231  // Now, process the word...
232  int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
233  std::vector<std::vector<std::vector<std::pair<const char*, float>>>>* rawTimestepMap =
234  nullptr;
235  std::vector<std::vector<std::pair<const char*, float>>>* CTCMap =
236  nullptr;
237  if (lstm_choice_mode) {
238 
239  CTCMap = res_it->GetBestLSTMSymbolChoices();
240  rawTimestepMap = res_it->GetRawLSTMTimesteps();
241  }
242  hocr_str << "\n <span class='ocrx_word'"
243  << " id='"
244  << "word_" << page_id << "_" << wcnt << "'";
245  int left, top, right, bottom;
246  bool bold, italic, underlined, monospace, serif, smallcaps;
247  int pointsize, font_id;
248  const char* font_name;
249  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
250  font_name =
251  res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
252  &serif, &smallcaps, &pointsize, &font_id);
253  hocr_str << " title='bbox " << left << " " << top << " " << right << " "
254  << bottom << "; x_wconf "
255  << static_cast<int>(res_it->Confidence(RIL_WORD));
256  if (font_info) {
257  if (font_name) {
258  hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
259  }
260  hocr_str << "; x_fsize " << pointsize;
261  }
262  hocr_str << "'";
263  const char* lang = res_it->WordRecognitionLanguage();
264  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
265  hocr_str << " lang='" << lang << "'";
266  }
267  switch (res_it->WordDirection()) {
268  // Only emit direction if different from current paragraph direction
269  case DIR_LEFT_TO_RIGHT:
270  if (!para_is_ltr) hocr_str << " dir='ltr'";
271  break;
272  case DIR_RIGHT_TO_LEFT:
273  if (para_is_ltr) hocr_str << " dir='rtl'";
274  break;
275  case DIR_MIX:
276  case DIR_NEUTRAL:
277  default: // Do nothing.
278  break;
279  }
280  hocr_str << ">";
281  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
282  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
283  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
284  if (bold) hocr_str << "<strong>";
285  if (italic) hocr_str << "<em>";
286  do {
287  const std::unique_ptr<const char[]> grapheme(
288  res_it->GetUTF8Text(RIL_SYMBOL));
289  if (grapheme && grapheme[0] != 0) {
290  if (hocr_boxes) {
291  res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
292  hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
293  << left << " " << top << " " << right << " " << bottom
294  << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
295  }
296  hocr_str << HOcrEscape(grapheme.get()).c_str();
297  if (hocr_boxes) {
298  hocr_str << "</span>";
299  tesseract::ChoiceIterator ci(*res_it);
300  if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
301  std::vector<std::vector<std::pair<const char*, float>>>* symbol =
302  ci.Timesteps();
303  hocr_str << "\n <span class='ocr_symbol'"
304  << " id='"
305  << "symbol_" << page_id << "_" << wcnt << "_" << scnt
306  << "'>";
307  for (auto timestep : *symbol) {
308  hocr_str << "\n <span class='ocrx_cinfo'"
309  << " id='"
310  << "timestep" << page_id << "_" << wcnt << "_" << tcnt
311  << "'>";
312  for (auto conf : timestep) {
313  hocr_str << "\n <span class='ocrx_cinfo'"
314  << " id='"
315  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
316  << "'"
317  << " title='x_confs " << int(conf.second * 100)
318  << "'>" << HOcrEscape(conf.first).c_str()
319  << "</span>";
320  ++ccnt;
321  }
322  hocr_str << "</span>";
323  ++tcnt;
324  }
325  hocr_str << "\n </span>";
326  ++scnt;
327  } else if (lstm_choice_mode == 2) {
328  tesseract::ChoiceIterator ci(*res_it);
329  hocr_str << "\n <span class='ocrx_cinfo'"
330  << " id='"
331  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
332  << "'>";
333  do {
334  const char* choice = ci.GetUTF8Text();
335  float choiceconf = ci.Confidence();
336  if (choice != nullptr) {
337  hocr_str << "\n <span class='ocrx_cinfo'"
338  << " id='"
339  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
340  << "'"
341  << " title='x_confs " << choiceconf << "'>"
342  << HOcrEscape(choice).c_str() << "</span>";
343  ccnt++;
344  }
345  } while (ci.Next());
346  hocr_str << "\n </span>";
347  tcnt++;
348  }
349  }
350  }
351  res_it->Next(RIL_SYMBOL);
352  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
353  if (italic) hocr_str << "</em>";
354  if (bold) hocr_str << "</strong>";
355  // If the lstm choice mode is required it is added here
356  if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
357  for (auto symbol : *rawTimestepMap) {
358  hocr_str << "\n <span class='ocr_symbol'"
359  << " id='"
360  << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
361  for (auto timestep : symbol) {
362  hocr_str << "\n <span class='ocrx_cinfo'"
363  << " id='"
364  << "timestep" << page_id << "_" << wcnt << "_" << tcnt
365  << "'>";
366  for (auto conf : timestep) {
367  hocr_str << "\n <span class='ocrx_cinfo'"
368  << " id='"
369  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
370  << "'"
371  << " title='x_confs " << int(conf.second * 100) << "'>"
372  << HOcrEscape(conf.first).c_str() << "</span>";
373  ++ccnt;
374  }
375  hocr_str << "</span>";
376  ++tcnt;
377  }
378  hocr_str << "</span>";
379  ++scnt;
380  }
381  } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
382  for (auto timestep : *CTCMap) {
383  if (timestep.size() > 0) {
384  hocr_str << "\n <span class='ocrx_cinfo'"
385  << " id='"
386  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
387  << "'>";
388  for (auto& j : timestep) {
389  float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
390  if (conf < 0.0f)
391  conf = 0.0f;
392  if (conf > 100.0f)
393  conf = 100.0f;
394  hocr_str << "\n <span class='ocrx_cinfo'"
395  << " id='"
396  << "choice_" << page_id << "_" << wcnt << "_" << ccnt
397  << "'"
398  << " title='x_confs " << conf << "'>"
399  << HOcrEscape(j.first).c_str() << "</span>";
400  ccnt++;
401  }
402  hocr_str << "</span>";
403  tcnt++;
404  }
405  }
406  }
407  // Close ocrx_word.
408  if (hocr_boxes || lstm_choice_mode > 0) {
409  hocr_str << "\n ";
410  }
411  hocr_str << "</span>";
412  tcnt = 1;
413  ccnt = 1;
414  wcnt++;
415  // Close any ending block/paragraph/textline.
416  if (last_word_in_line) {
417  hocr_str << "\n </span>";
418  lcnt++;
419  }
420  if (last_word_in_para) {
421  hocr_str << "\n </p>\n";
422  pcnt++;
423  para_is_ltr = true; // back to default direction
424  }
425  if (last_word_in_block) {
426  hocr_str << " </div>\n";
427  bcnt++;
428  }
429  }
430  hocr_str << " </div>\n";
431 
432  const std::string& text = hocr_str.str();
433  char* result = new char[text.length() + 1];
434  strcpy(result, text.c_str());
435  return result;
436 }
437 
438 /**********************************************************************
439  * HOcr Text Renderer interface implementation
440  **********************************************************************/
441 TessHOcrRenderer::TessHOcrRenderer(const char* outputbase)
442  : TessResultRenderer(outputbase, "hocr") {
443  font_info_ = false;
444 }
445 
446 TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info)
447  : TessResultRenderer(outputbase, "hocr") {
448  font_info_ = font_info;
449 }
450 
452  AppendString(
453  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
454  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
455  " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
456  "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
457  "lang=\"en\">\n <head>\n <title>");
458  AppendString(title());
459  AppendString(
460  "</title>\n"
461  " <meta http-equiv=\"Content-Type\" content=\"text/html;"
462  "charset=utf-8\"/>\n"
463  " <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
464  "' />\n"
465  " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
466  " ocr_line ocrx_word ocrp_wconf");
467  if (font_info_) AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
468  AppendString(
469  "'/>\n"
470  " </head>\n"
471  " <body>\n");
472 
473  return true;
474 }
475 
477  AppendString(" </body>\n</html>\n");
478 
479  return true;
480 }
481 
483  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
484  if (hocr == nullptr) return false;
485 
486  AppendString(hocr.get());
487 
488  return true;
489 }
490 
491 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::RIL_WORD
Definition: publictypes.h:220
tesseract::TessBaseAPI::page_res_
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:893
host.h
tesseract::Tesseract::lstm_choice_mode
int lstm_choice_mode
Definition: tesseractclass.h:1086
tesseractclass.h
tesseract::Tesseract::lstm_rating_coefficient
double lstm_rating_coefficient
Definition: tesseractclass.h:1095
tesseract::RIL_BLOCK
Definition: publictypes.h:217
tesseract::ChoiceIterator
Definition: ltrresultiterator.h:186
tesseract::TessBaseAPI::rect_top_
int rect_top_
Definition: baseapi.h:908
tesseract::WritingDirection
WritingDirection
Definition: publictypes.h:131
PT_CAPTION_TEXT
Definition: capi.h:116
tesseract::TessBaseAPI::rect_height_
int rect_height_
Definition: baseapi.h:910
ETEXT_DESC
Definition: ocrclass.h:95
tesseract::TessResultRenderer::AppendString
void AppendString(const char *s)
Definition: renderer.cpp:101
DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
tesseract::RIL_SYMBOL
Definition: publictypes.h:221
tesseract::TessHOcrRenderer::TessHOcrRenderer
TessHOcrRenderer(const char *outputbase, bool font_info)
Definition: hocrrenderer.cpp:460
tesseract::TessBaseAPI::rect_width_
int rect_width_
Definition: baseapi.h:909
tesseract::TextlineOrder
TextlineOrder
Definition: publictypes.h:148
baseapi.h
tesseract::TessHOcrRenderer::AddImageHandler
bool AddImageHandler(TessBaseAPI *api) override
Definition: hocrrenderer.cpp:496
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TessHOcrRenderer::EndDocumentHandler
bool EndDocumentHandler() override
Definition: hocrrenderer.cpp:490
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
PT_HEADING_TEXT
Definition: capi.h:110
tesseract::TessHOcrRenderer::BeginDocumentHandler
bool BeginDocumentHandler() override
Definition: hocrrenderer.cpp:465
tesseract::TessBaseAPI::tesseract_
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:886
tesseract::TessBaseAPI::GetIterator
ResultIterator * GetIterator()
Definition: baseapi.cpp:1321
tesseract::TessBaseAPI::input_file_
STRING * input_file_
Name used by training code.
Definition: baseapi.h:894
tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:216
tesseract
Definition: baseapi.h:65
tesseract::TessBaseAPI::SetInputName
void SetInputName(const char *name)
Definition: baseapi.cpp:262
tesseract::RIL_TEXTLINE
Definition: publictypes.h:219
DIR_NEUTRAL
Definition: unichar.h:44
tesseract::TessBaseAPI::rect_left_
int rect_left_
Definition: baseapi.h:907
tesseract::TessBaseAPI::GetBoolVariable
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:297
tesseract::ORIENTATION_PAGE_UP
Definition: publictypes.h:117
tesseract::TessResultRenderer::title
const char * title() const
Definition: renderer.h:89
renderer.h
DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
DIR_MIX
Definition: unichar.h:47
PT_PULLOUT_TEXT
Definition: capi.h:111
tesseract::TessResultRenderer::imagenum
int imagenum() const
Definition: renderer.h:107
TessBaseAPI
struct TessBaseAPI TessBaseAPI
Definition: capi.h:72
tesseract::Orientation
Orientation
Definition: publictypes.h:116
TessResultRenderer
struct TessResultRenderer TessResultRenderer
Definition: capi.h:71
tesseract::RIL_PARA
Definition: publictypes.h:218
tesseract::TessBaseAPI::Recognize
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:827
tesseract::HOcrEscape
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2307
tesseract::TessBaseAPI::GetHOCRText
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
Definition: hocrrenderer.cpp:147