tesseract  5.0.0-alpha-619-ge9db
altorenderer.cpp
Go to the documentation of this file.
1 // File: altorenderer.cpp
2 // Description: ALTO rendering interface
3 // Author: Jake Sebright
4 
5 // (C) Copyright 2018
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include <memory>
17 #include <sstream> // for std::stringstream
18 #include <tesseract/baseapi.h>
19 #ifdef _WIN32
20 # include "host.h" // windows.h for MultiByteToWideChar, ...
21 #endif
22 #include <tesseract/renderer.h>
23 #include <tesseract/strngs.h> // for STRING
24 
25 namespace tesseract {
26 
30 static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
31  std::stringstream& alto_str) {
32  int left, top, right, bottom;
33  it->BoundingBox(level, &left, &top, &right, &bottom);
34 
35  int hpos = left;
36  int vpos = top;
37  int height = bottom - top;
38  int width = right - left;
39 
40  alto_str << " HPOS=\"" << hpos << "\"";
41  alto_str << " VPOS=\"" << vpos << "\"";
42  alto_str << " WIDTH=\"" << width << "\"";
43  alto_str << " HEIGHT=\"" << height << "\"";
44 
45  if (level == RIL_WORD) {
46  int wc = it->Confidence(RIL_WORD);
47  alto_str << " WC=\"0." << wc << "\"";
48  } else {
49  alto_str << ">";
50  }
51 }
52 
58  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
59  "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
60  "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
61  "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
62  "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
63  "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
64  "\t<Description>\n"
65  "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
66  "\t\t<sourceImageInformation>\n"
67  "\t\t\t<fileName>");
68 
70 
72  "</fileName>\n"
73  "\t\t</sourceImageInformation>\n"
74  "\t\t<OCRProcessing ID=\"OCR_0\">\n"
75  "\t\t\t<ocrProcessingStep>\n"
76  "\t\t\t\t<processingSoftware>\n"
77  "\t\t\t\t\t<softwareName>tesseract ");
80  "</softwareName>\n"
81  "\t\t\t\t</processingSoftware>\n"
82  "\t\t\t</ocrProcessingStep>\n"
83  "\t\t</OCRProcessing>\n"
84  "\t</Description>\n"
85  "\t<Layout>\n");
86 
87  return true;
88 }
89 
94  const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
95  if (text == nullptr) return false;
96 
97  AppendString(text.get());
98 
99  return true;
100 }
101 
106  AppendString("\t</Layout>\n</alto>\n");
107 
108  return true;
109 }
110 
111 TessAltoRenderer::TessAltoRenderer(const char* outputbase)
112  : TessResultRenderer(outputbase, "xml") {}
113 
118 char* TessBaseAPI::GetAltoText(int page_number) {
119  return GetAltoText(nullptr, page_number);
120 }
121 
126 char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
127  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
128  return nullptr;
129 
130  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
131 
132  if (input_file_ == nullptr) SetInputName(nullptr);
133 
134 #ifdef _WIN32
135  // convert input name from ANSI encoding to utf-8
136  int str16_len =
137  MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1, nullptr, 0);
138  wchar_t* uni16_str = new WCHAR[str16_len];
139  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->c_str(), -1,
140  uni16_str, str16_len);
141  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
142  0, nullptr, nullptr);
143  char* utf8_str = new char[utf8_len];
144  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
145  nullptr, nullptr);
146  *input_file_ = utf8_str;
147  delete[] uni16_str;
148  delete[] utf8_str;
149 #endif
150 
151  std::stringstream alto_str;
152  // Use "C" locale (needed for int values larger than 999).
153  alto_str.imbue(std::locale::classic());
154  alto_str
155  << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
156  << rect_height_
157  << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
158  << " ID=\"page_" << page_number << "\">\n"
159  << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
160  << " WIDTH=\"" << rect_width_ << "\""
161  << " HEIGHT=\"" << rect_height_ << "\">\n";
162 
163  ResultIterator* res_it = GetIterator();
164  while (!res_it->Empty(RIL_BLOCK)) {
165  if (res_it->Empty(RIL_WORD)) {
166  res_it->Next(RIL_WORD);
167  continue;
168  }
169 
170  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
171  alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
172  AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
173  alto_str << "\n";
174  }
175 
176  if (res_it->IsAtBeginningOf(RIL_PARA)) {
177  alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
178  AddBoxToAlto(res_it, RIL_PARA, alto_str);
179  alto_str << "\n";
180  }
181 
182  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
183  alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
184  AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
185  alto_str << "\n";
186  }
187 
188  alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
189  AddBoxToAlto(res_it, RIL_WORD, alto_str);
190  alto_str << " CONTENT=\"";
191 
192  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
193  bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
194  bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
195 
196 
197  int left, top, right, bottom;
198  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
199 
200  do {
201  const std::unique_ptr<const char[]> grapheme(
202  res_it->GetUTF8Text(RIL_SYMBOL));
203  if (grapheme && grapheme[0] != 0) {
204  alto_str << HOcrEscape(grapheme.get()).c_str();
205  }
206  res_it->Next(RIL_SYMBOL);
207  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
208 
209  alto_str << "\"/>";
210 
211  wcnt++;
212 
213  if (last_word_in_line) {
214  alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
215  lcnt++;
216  } else {
217  int hpos = right;
218  int vpos = top;
219  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
220  int width = left - hpos;
221  alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
222  << "\" HPOS=\"" << hpos << "\"/>\n";
223  }
224 
225  if (last_word_in_tblock) {
226  alto_str << "\t\t\t\t\t</TextBlock>\n";
227  tcnt++;
228  }
229 
230  if (last_word_in_cblock) {
231  alto_str << "\t\t\t\t</ComposedBlock>\n";
232  bcnt++;
233  }
234  }
235 
236  alto_str << "\t\t\t</PrintSpace>\n"
237  << "\t\t</Page>\n";
238  const std::string& text = alto_str.str();
239 
240  char* result = new char[text.length() + 1];
241  strcpy(result, text.c_str());
242  delete res_it;
243  return result;
244 }
245 
246 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
strngs.h
tesseract::ResultIterator::IsAtFinalElement
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
Definition: resultiterator.cpp:570
tesseract::RIL_WORD
Definition: publictypes.h:220
tesseract::TessBaseAPI::page_res_
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:893
host.h
tesseract::RIL_BLOCK
Definition: publictypes.h:217
tesseract::ResultIterator::IsAtBeginningOf
bool IsAtBeginningOf(PageIteratorLevel level) const override
Definition: resultiterator.cpp:527
tesseract::TessBaseAPI::rect_height_
int rect_height_
Definition: baseapi.h:910
ETEXT_DESC
Definition: ocrclass.h:95
tesseract::TessResultRenderer::AppendString
void AppendString(const char *s)
Definition: renderer.cpp:101
tesseract::TessAltoRenderer::TessAltoRenderer
TessAltoRenderer(const char *outputbase)
Definition: altorenderer.cpp:111
tesseract::RIL_SYMBOL
Definition: publictypes.h:221
tesseract::TessAltoRenderer::EndDocumentHandler
bool EndDocumentHandler() override
Definition: altorenderer.cpp:105
tesseract::TessBaseAPI::rect_width_
int rect_width_
Definition: baseapi.h:909
baseapi.h
tesseract::PageIterator::Empty
bool Empty(PageIteratorLevel level) const
Definition: pageiterator.cpp:349
tesseract::TessAltoRenderer::AddImageHandler
bool AddImageHandler(TessBaseAPI *api) override
Definition: altorenderer.cpp:93
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TessBaseAPI::tesseract_
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:886
tesseract::TessBaseAPI::GetIterator
ResultIterator * GetIterator()
Definition: baseapi.cpp:1321
tesseract::TessBaseAPI
Definition: baseapi.h:98
tesseract::TessBaseAPI::input_file_
STRING * input_file_
Name used by training code.
Definition: baseapi.h:894
tesseract::TessAltoRenderer::BeginDocumentHandler
bool BeginDocumentHandler() override
Definition: altorenderer.cpp:56
tesseract::PageIteratorLevel
PageIteratorLevel
Definition: publictypes.h:216
tesseract::TessBaseAPI::Version
static const char * Version()
Definition: baseapi.cpp:233
tesseract
Definition: baseapi.h:65
tesseract::TessBaseAPI::SetInputName
void SetInputName(const char *name)
Definition: baseapi.cpp:262
tesseract::RIL_TEXTLINE
Definition: publictypes.h:219
tesseract::ResultIterator::Next
bool Next(PageIteratorLevel level) override
Definition: resultiterator.cpp:449
tesseract::ResultIterator
Definition: resultiterator.h:44
tesseract::PageIterator::BoundingBox
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
Definition: pageiterator.cpp:325
tesseract::TessResultRenderer::title
const char * title() const
Definition: renderer.h:89
renderer.h
tesseract::TessResultRenderer
Definition: renderer.h:49
tesseract::ResultIterator::GetUTF8Text
virtual char * GetUTF8Text(PageIteratorLevel level) const
Definition: resultiterator.cpp:603
tesseract::TessResultRenderer::imagenum
int imagenum() const
Definition: renderer.h:107
tesseract::TessBaseAPI::GetAltoText
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
Definition: altorenderer.cpp:126
tesseract::RIL_PARA
Definition: publictypes.h:218
tesseract::TessBaseAPI::Recognize
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:827
tesseract::HOcrEscape
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2307