40 it->Orientation(&orientation, &writing_direction, &textline_order,
// Appends baseline-related properties for the element at `level` to the hOCR
// attribute text accumulated in `hocr_str`.
// The visible code writes either a "textangle" value derived from
// `orientation`, or a "baseline" slope/offset pair fitted through the two
// baseline points returned by the iterator.
// NOTE(review): this listing omits lines, so the condition that selects
// between the two outputs (and the declaration of `level`) is not visible.
53 static void AddBaselineCoordsTohOCR(
const PageIterator* it,
55 std::stringstream& hocr_str) {
// Writes the text angle as 360 minus 90 times the orientation value.
58 hocr_str <<
"; textangle " << 360 - orientation * 90;
// Bounding box of the element at `level`.
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
// Nothing more is written when Baseline() reports failure.
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2))
return;
// Line through (x1, y1) and (x2, y2) expressed as y = p1 * x + p0.
// NOTE(review): the division requires x2 != x1; any guard for that case is
// not visible in this excerpt - confirm.
82 double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
83 double p0 = y1 - p1 * x1;
// Both coefficients are rounded to three decimal places before being written.
85 hocr_str <<
"; baseline " << round(p1 * 1000.0) / 1000.0 <<
" "
86 << round(p0 * 1000.0) / 1000.0;
// Tail of a helper's signature followed by its body; the first signature lines
// are not part of this listing. Presumably this is AddBoxTohOCR, which is
// called elsewhere in this file with (iterator, level, stream) - confirm.
// Writes a title attribute that starts with the element's bounding box, then
// appends baseline data and row metrics.
90 std::stringstream& hocr_str) {
// Bounding box of the element at `level`.
91 int left, top, right, bottom;
92 it->BoundingBox(level, &left, &top, &right, &bottom);
// Opens the title attribute with "bbox left top right "; the remainder of this
// statement is not included in the listing.
95 hocr_str <<
" title=\"bbox " << left <<
" " << top <<
" " << right <<
" "
// Adds the textangle/baseline properties for the same element.
99 AddBaselineCoordsTohOCR(it, level, hocr_str);
// Row metrics reported by the iterator.
// NOTE(review): whether this part runs for every level is not visible here.
101 float row_height, descenders, ascenders;
102 it->RowAttributes(&row_height, &descenders, &ascenders);
// x_descenders is written with its sign flipped; the other two are written as
// returned.
104 hocr_str <<
"; x_size " << row_height <<
"; x_descenders " << -descenders
105 <<
"; x_ascenders " << ascenders;
// Running counters used to build element ids further down: line, block,
// paragraph, word, symbol, timestep and choice. All of them start at 1.
136 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
// Page id used in every generated element id; one more than page_number.
137 int page_id = page_number + 1;
// Direction of the current paragraph; refreshed when a new paragraph starts.
138 bool para_is_ltr =
true;
// Language of the current paragraph; nullptr until a paragraph reports one.
139 const char* paragraph_lang =
nullptr;
// Output switches, both off by default.
// NOTE(review): the code that may turn them on is not part of this listing.
140 bool font_info =
false;
141 bool hocr_boxes =
false;
// UTF-16 to UTF-8 conversion using the Win32 API.
// NOTE(review): the head of the call that fills uni16_str and the code that
// releases the two new[] buffers are not visible in this listing - confirm
// both buffers are freed.
151 wchar_t* uni16_str =
new WCHAR[str16_len];
153 uni16_str, str16_len);
// First call passes no output buffer and a size of 0; its return value is used
// as the size of the UTF-8 buffer.
154 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
155 0,
nullptr,
nullptr);
156 char* utf8_str =
new char[utf8_len];
// Second call performs the conversion into utf8_str; the tail of this call is
// not included in the listing.
157 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
// Stream that accumulates the hOCR markup.
164 std::stringstream hocr_str;
// Use the classic "C" locale for this stream so formatting does not follow the
// global locale.
166 hocr_str.imbue(std::locale::classic());
// Floating point values are written with a precision of 8.
168 hocr_str.precision(8);
// Opens the page element.
169 hocr_str <<
" <div class='ocr_page'";
// Page id text of the form page_<page_id>; the head of this statement is not
// included in the listing.
171 <<
"page_" << page_id <<
"'";
// Starts the title attribute with the image name in double quotes.
172 hocr_str <<
" title='image \"";
// Placeholder name.
// NOTE(review): the condition under which "unknown" is written is not visible
// in this listing.
176 hocr_str <<
"unknown";
// Owning handle for the iterator returned by GetIterator().
182 std::unique_ptr<ResultIterator> res_it(
GetIterator());
// At the first word of a block: open the ocr_carea element with id
// block_<page>_<bcnt> and append its bounding box information.
190 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
192 hocr_str <<
" <div class='ocr_carea'"
194 <<
"block_" << page_id <<
"_" << bcnt <<
"'";
195 AddBoxTohOCR(res_it.get(),
RIL_BLOCK, hocr_str);
// At the first word of a paragraph: open the ocr_par element and remember the
// paragraph direction for the words that follow.
197 if (res_it->IsAtBeginningOf(
RIL_PARA)) {
198 hocr_str <<
"\n <p class='ocr_par'";
199 para_is_ltr = res_it->ParagraphIsLtr();
// NOTE(review): the condition guarding this attribute is not in the listing;
// presumably it is written only for right-to-left paragraphs - confirm.
201 hocr_str <<
" dir='rtl'";
// Paragraph id text of the form par_<page>_<pcnt>.
204 <<
"par_" << page_id <<
"_" << pcnt <<
"'";
// The paragraph language is taken from the current word and is written only
// when it is non-null.
205 paragraph_lang = res_it->WordRecognitionLanguage();
206 if (paragraph_lang) {
207 hocr_str <<
" lang='" << paragraph_lang <<
"'";
209 AddBoxTohOCR(res_it.get(),
RIL_PARA, hocr_str);
// Opens the line-level span; its class name depends on the block type.
// The case labels of the switch are not part of this listing.
212 hocr_str <<
"\n <span class='";
213 switch (res_it->BlockType()) {
215 hocr_str <<
"ocr_header";
218 hocr_str <<
"ocr_textfloat";
221 hocr_str <<
"ocr_caption";
224 hocr_str <<
"ocr_line";
// Line id text of the form line_<page>_<lcnt>.
227 <<
"line_" << page_id <<
"_" << lcnt <<
"'";
// Pointers to the per-word LSTM data: raw timesteps (symbol -> timestep ->
// (text, score) pairs) and the best symbol choices (timestep -> pairs).
// Their initializers are not included in the listing.
233 std::vector<std::vector<std::vector<std::pair<const char*, float>>>>* rawTimestepMap =
235 std::vector<std::vector<std::pair<const char*, float>>>* CTCMap =
// The LSTM data is fetched from the iterator when lstm_choice_mode is non-zero.
237 if (lstm_choice_mode) {
239 CTCMap = res_it->GetBestLSTMSymbolChoices();
240 rawTimestepMap = res_it->GetRawLSTMTimesteps();
// Opens the word element; its id text has the form word_<page>_<wcnt>.
242 hocr_str <<
"\n <span class='ocrx_word'"
244 <<
"word_" << page_id <<
"_" << wcnt <<
"'";
// Word bounding box and font attributes queried from the iterator.
// NOTE(review): the assignment to font_name is not visible in this listing.
245 int left, top, right, bottom;
246 bool bold, italic, underlined, monospace, serif, smallcaps;
247 int pointsize, font_id;
248 const char* font_name;
249 res_it->BoundingBox(
RIL_WORD, &left, &top, &right, &bottom);
251 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
252 &serif, &smallcaps, &pointsize, &font_id);
// Title attribute: bounding box followed by the word confidence converted to
// int (the fractional part is dropped by the cast).
253 hocr_str <<
" title='bbox " << left <<
" " << top <<
" " << right <<
" "
254 << bottom <<
"; x_wconf "
255 << static_cast<int>(res_it->Confidence(
RIL_WORD));
// Font size property.
// NOTE(review): the condition guarding this output is not in the listing.
260 hocr_str <<
"; x_fsize " << pointsize;
// A word-level lang attribute is written only when the word has a language and
// it differs from the paragraph language (or the paragraph has none).
263 const char* lang = res_it->WordRecognitionLanguage();
264 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
265 hocr_str <<
" lang='" << lang <<
"'";
// A word-level dir attribute is written only when it disagrees with the
// paragraph direction. The case labels are not part of this listing.
267 switch (res_it->WordDirection()) {
270 if (!para_is_ltr) hocr_str <<
" dir='ltr'";
273 if (para_is_ltr) hocr_str <<
" dir='rtl'";
// Opening tags for bold and italic words.
284 if (bold) hocr_str <<
"<strong>";
285 if (italic) hocr_str <<
"<em>";
// Text of the current symbol; the initializer is not included in the listing.
287 const std::unique_ptr<const char[]> grapheme(
// Only non-empty symbol text produces output.
289 if (grapheme && grapheme[0] != 0) {
// Per-symbol span carrying the symbol bounding box and confidence.
// NOTE(review): the condition guarding this span is not in the listing.
291 res_it->BoundingBox(
RIL_SYMBOL, &left, &top, &right, &bottom);
292 hocr_str <<
"\n <span class='ocrx_cinfo' title='x_bboxes "
293 << left <<
" " << top <<
" " << right <<
" " << bottom
294 <<
"; x_conf " << res_it->Confidence(
RIL_SYMBOL) <<
"'>";
// The symbol text is passed through HOcrEscape before being written.
296 hocr_str <<
HOcrEscape(grapheme.get()).c_str();
298 hocr_str <<
"</span>";
// Mode 1 with timestep data available: one ocr_symbol span per symbol,
// containing a span per timestep, each containing a span per choice.
300 if (lstm_choice_mode == 1 && ci.Timesteps() !=
nullptr) {
301 std::vector<std::vector<std::pair<const char*, float>>>* symbol =
303 hocr_str <<
"\n <span class='ocr_symbol'"
305 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt
// NOTE(review): `auto timestep` and `auto conf` take each element by value.
307 for (
auto timestep : *symbol) {
308 hocr_str <<
"\n <span class='ocrx_cinfo'"
310 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
312 for (
auto conf : timestep) {
313 hocr_str <<
"\n <span class='ocrx_cinfo'"
315 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
// The second member of the pair is multiplied by 100 and converted to int.
317 <<
" title='x_confs " <<
int(conf.second * 100)
322 hocr_str <<
"</span>";
325 hocr_str <<
"\n </span>";
327 }
else if (lstm_choice_mode == 2) {
// Mode 2: one lstm_choices span, with a choice span for each non-null choice
// text obtained from `ci`.
329 hocr_str <<
"\n <span class='ocrx_cinfo'"
331 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
334 const char* choice = ci.GetUTF8Text();
335 float choiceconf = ci.Confidence();
336 if (choice !=
nullptr) {
337 hocr_str <<
"\n <span class='ocrx_cinfo'"
339 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
341 <<
" title='x_confs " << choiceconf <<
"'>"
346 hocr_str <<
"\n </span>";
// Closing tags mirror the opening tags in reverse order.
353 if (italic) hocr_str <<
"</em>";
354 if (bold) hocr_str <<
"</strong>";
// Mode 1 without hocr_boxes, and raw timestep data available: one ocr_symbol
// span per symbol, a span per timestep inside it, and a span per choice inside
// each timestep.
// NOTE(review): the three range-for loops take their elements by value.
356 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap !=
nullptr) {
357 for (
auto symbol : *rawTimestepMap) {
358 hocr_str <<
"\n <span class='ocr_symbol'"
360 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt <<
"'>";
361 for (
auto timestep : symbol) {
362 hocr_str <<
"\n <span class='ocrx_cinfo'"
364 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
366 for (
auto conf : timestep) {
367 hocr_str <<
"\n <span class='ocrx_cinfo'"
369 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
// The second member of the pair is multiplied by 100 and converted to int.
371 <<
" title='x_confs " <<
int(conf.second * 100) <<
"'>"
375 hocr_str <<
"</span>";
378 hocr_str <<
"</span>";
381 }
else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap !=
nullptr) {
// Mode 2 without hocr_boxes, and symbol choices available: every non-empty
// timestep yields an lstm_choices span with one choice span per entry.
382 for (
auto timestep : *CTCMap) {
383 if (timestep.size() > 0) {
384 hocr_str <<
"\n <span class='ocrx_cinfo'"
386 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
388 for (
auto& j : timestep) {
394 hocr_str <<
"\n <span class='ocrx_cinfo'"
396 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
// NOTE(review): the computation of `conf` is not included in this listing.
398 <<
" title='x_confs " << conf <<
"'>"
402 hocr_str <<
"</span>";
// NOTE(review): the body of this condition is not included in the listing; the
// closing tag written below is a separate statement further down.
408 if (hocr_boxes || lstm_choice_mode > 0) {
411 hocr_str <<
"</span>";
// Closing markup, written when the current word is the last one of its line,
// paragraph or block respectively.
416 if (last_word_in_line) {
417 hocr_str <<
"\n </span>";
420 if (last_word_in_para) {
421 hocr_str <<
"\n </p>\n";
425 if (last_word_in_block) {
426 hocr_str <<
" </div>\n";
// Final closing div, written separately from the per-block one above.
430 hocr_str <<
" </div>\n";
// Copies `text` into a new[] buffer with room for the terminating NUL.
// NOTE(review): presumably this buffer is returned and the caller releases it
// with delete[] - confirm against the function's return statement.
433 char* result =
new char[text.length() + 1];
434 strcpy(result, text.c_str());
// Stores the font_info argument in the font_info_ member, which is read when
// the document header is written. The enclosing definition is not part of this
// listing.
448 font_info_ = font_info;
// String literals that make up the hOCR document header. The heads of the
// calls that emit them are not included in this listing.
// First part: XML declaration, XHTML 1.0 Transitional doctype, the opening
// <html> and <head> tags and the opening <title> tag.
453 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
454 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
455 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
456 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
457 "lang=\"en\">\n <head>\n <title>");
// Content-Type meta tag, then the ocr-system meta tag whose content is
// concatenated with the PACKAGE_VERSION macro.
461 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
462 "charset=utf-8\"/>\n"
463 " <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
// Capabilities that are always listed.
465 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
466 " ocr_line ocrx_word ocrp_wconf");
// Additional capabilities are listed only when font_info_ is set.
467 if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
483 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(
imagenum()));
484 if (hocr ==
nullptr)
return false;