tesseract  5.0.0-alpha-619-ge9db
wordstrboxrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: wordstrboxrenderer.cpp
3  * Description: Renderer for creating box file with WordStr strings.
4  * based on the tsv renderer.
5  *
6  * (C) Copyright 2019, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <tesseract/baseapi.h> // for TessBaseAPI
20 #include <tesseract/renderer.h>
21 #include "tesseractclass.h" // for Tesseract
22 
23 namespace tesseract {
24 
31 char* TessBaseAPI::GetWordStrBoxText(int page_number=0) {
32  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
33  return nullptr;
34 
35  STRING wordstr_box_str("");
36  int left = 0, top = 0, right = 0, bottom = 0;
37 
38  bool first_line = true;
39 
40  LTRResultIterator* res_it = GetLTRIterator();
41  while (!res_it->Empty(RIL_BLOCK)) {
42  if (res_it->Empty(RIL_WORD)) {
43  res_it->Next(RIL_WORD);
44  continue;
45  }
46 
47  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
48  if (!first_line) {
49  wordstr_box_str.add_str_int("\n\t ", right + 1);
50  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
51  wordstr_box_str.add_str_int(" ", right + 5);
52  wordstr_box_str.add_str_int(" ", image_height_ - top);
53  wordstr_box_str.add_str_int(" ", page_number); // row for tab for EOL
54  wordstr_box_str += "\n";
55  } else {
56  first_line = false;
57  }
58  // Use bounding box for whole line for WordStr
59  res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
60  wordstr_box_str.add_str_int("WordStr ", left);
61  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
62  wordstr_box_str.add_str_int(" ", right);
63  wordstr_box_str.add_str_int(" ", image_height_ - top);
64  wordstr_box_str.add_str_int(" ", page_number); // word
65  wordstr_box_str += " #";
66  }
67  do {
68  wordstr_box_str +=
69  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
70  wordstr_box_str += " ";
71  res_it->Next(RIL_WORD);
72  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
73  }
74 
75  if (left != 0 && top != 0 && right != 0 && bottom != 0) {
76  wordstr_box_str.add_str_int("\n\t ", right + 1);
77  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
78  wordstr_box_str.add_str_int(" ", right + 5);
79  wordstr_box_str.add_str_int(" ", image_height_ - top);
80  wordstr_box_str.add_str_int(" ", page_number); // row for tab for EOL
81  wordstr_box_str += "\n";
82  }
83  char* ret = new char[wordstr_box_str.length() + 1];
84  strcpy(ret, wordstr_box_str.c_str());
85  delete res_it;
86  return ret;
87 }
88 
89 /**********************************************************************
90  * WordStrBox Renderer interface implementation
91  **********************************************************************/
93  : TessResultRenderer(outputbase, "box") {}
94 
96  const std::unique_ptr<const char[]> wordstrbox(
97  api->GetWordStrBoxText(imagenum()));
98  if (wordstrbox == nullptr) return false;
99 
100  AppendString(wordstrbox.get());
101 
102  return true;
103 }
104 
105 } // namespace tesseract.
tesseract::RIL_WORD
Definition: publictypes.h:220
tesseract::TessBaseAPI::page_res_
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:893
tesseract::TessBaseAPI::image_height_
int image_height_
Definition: baseapi.h:912
tesseract::TessWordStrBoxRenderer::TessWordStrBoxRenderer
TessWordStrBoxRenderer(const char *outputbase)
Definition: wordstrboxrenderer.cpp:105
tesseractclass.h
tesseract::RIL_BLOCK
Definition: publictypes.h:217
STRING
Definition: strngs.h:45
tesseract::TessResultRenderer::AppendString
void AppendString(const char *s)
Definition: renderer.cpp:101
baseapi.h
tesseract::TessBaseAPI::tesseract_
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:886
tesseract
Definition: baseapi.h:65
tesseract::RIL_TEXTLINE
Definition: publictypes.h:219
tesseract::TessBaseAPI::GetLTRIterator
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1304
renderer.h
tesseract::TessBaseAPI::GetWordStrBoxText
char * GetWordStrBoxText(int page_number)
Definition: wordstrboxrenderer.cpp:45
tesseract::TessResultRenderer::imagenum
int imagenum() const
Definition: renderer.h:107
TessBaseAPI
struct TessBaseAPI TessBaseAPI
Definition: capi.h:72
TessResultRenderer
struct TessResultRenderer TessResultRenderer
Definition: capi.h:71
tesseract::TessBaseAPI::Recognize
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:827
tesseract::TessWordStrBoxRenderer::AddImageHandler
bool AddImageHandler(TessBaseAPI *api) override
Definition: wordstrboxrenderer.cpp:108