31 std::stringstream& alto_str) {
32 int left, top, right, bottom;
33 it->BoundingBox(level, &left, &top, &right, &bottom);
37 int height = bottom - top;
38 int width = right - left;
40 alto_str <<
" HPOS=\"" << hpos <<
"\"";
41 alto_str <<
" VPOS=\"" << vpos <<
"\"";
42 alto_str <<
" WIDTH=\"" << width <<
"\"";
43 alto_str <<
" HEIGHT=\"" << height <<
"\"";
47 alto_str <<
" WC=\"0." << wc <<
"\"";
58 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
59 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
60 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
61 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
62 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
63 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
65 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
66 "\t\t<sourceImageInformation>\n"
73 "\t\t</sourceImageInformation>\n"
74 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
75 "\t\t\t<ocrProcessingStep>\n"
76 "\t\t\t\t<processingSoftware>\n"
77 "\t\t\t\t\t<softwareName>tesseract ");
81 "\t\t\t\t</processingSoftware>\n"
82 "\t\t\t</ocrProcessingStep>\n"
83 "\t\t</OCRProcessing>\n"
95 if (text ==
nullptr)
return false;
130 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
138 wchar_t* uni16_str =
new WCHAR[str16_len];
140 uni16_str, str16_len);
141 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
142 0,
nullptr,
nullptr);
143 char* utf8_str =
new char[utf8_len];
144 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
151 std::stringstream alto_str;
153 alto_str.imbue(std::locale::classic());
155 <<
"\t\t<Page WIDTH=\"" <<
rect_width_ <<
"\" HEIGHT=\""
157 <<
"\" PHYSICAL_IMG_NR=\"" << page_number <<
"\""
158 <<
" ID=\"page_" << page_number <<
"\">\n"
159 <<
"\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
171 alto_str <<
"\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt <<
"\"";
172 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
177 alto_str <<
"\t\t\t\t\t<TextBlock ID=\"block_" << tcnt <<
"\"";
178 AddBoxToAlto(res_it,
RIL_PARA, alto_str);
183 alto_str <<
"\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt <<
"\"";
188 alto_str <<
"\t\t\t\t\t\t\t<String ID=\"string_" << wcnt <<
"\"";
189 AddBoxToAlto(res_it,
RIL_WORD, alto_str);
190 alto_str <<
" CONTENT=\"";
197 int left, top, right, bottom;
201 const std::unique_ptr<const char[]> grapheme(
203 if (grapheme && grapheme[0] != 0) {
204 alto_str <<
HOcrEscape(grapheme.get()).c_str();
213 if (last_word_in_line) {
214 alto_str <<
"\n\t\t\t\t\t\t</TextLine>\n";
220 int width = left - hpos;
221 alto_str <<
"<SP WIDTH=\"" << width <<
"\" VPOS=\"" << vpos
222 <<
"\" HPOS=\"" << hpos <<
"\"/>\n";
225 if (last_word_in_tblock) {
226 alto_str <<
"\t\t\t\t\t</TextBlock>\n";
230 if (last_word_in_cblock) {
231 alto_str <<
"\t\t\t\t</ComposedBlock>\n";
236 alto_str <<
"\t\t\t</PrintSpace>\n"
240 char* result =
new char[text.length() + 1];
241 strcpy(result, text.c_str());