31 #include "allheaders.h"
35 #include "pango/pango-font.h"
36 #include "pango/pango-glyph-item.h"
39 #include "unicode/uchar.h"
// Resolution handed to set_resolution() when the renderer is set up.
// NOTE(review): presumably dots per inch -- confirm.
44 static const int kDefaultOutputResolution = 300;
// U+2060 (WORD JOINER) as a narrow string. It is appended to text when
// joining characters and is compared against box text in
// ComputeClusterBoxes().
49 static const char* kWordJoinerUTF8 =
"\u2060";
// Returns true when u_charType() classifies the code point |ch| as a
// non-spacing mark, an enclosing mark, or a combining spacing mark.
51 static bool IsCombiner(
int ch) {
52 const int char_type = u_charType(ch);
53 return ((char_type == U_NON_SPACING_MARK) ||
54 (char_type == U_ENCLOSING_MARK) ||
55 (char_type == U_COMBINING_SPACING_MARK));
60 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
// Returns a random boolean that is true with probability |prob|, using |rand|
// as the source of randomness.
// The exact values 1.0 and 0.0 are answered directly, without drawing a
// random number.
64 static bool RandBool(
const double prob, TRand* rand) {
65 if (prob == 1.0)
return true;
66 if (prob == 0.0)
return false;
// NOTE(review): assumes UnsignedRand(1.0) yields a value in [0, 1) -- confirm
// against TRand.
67 return rand->UnsignedRand(1.0) < prob;
// Creates a 32-bit-deep Pix with the dimensions of |surface| and copies the
// surface's pixel rows into it.
// A surface whose format is not CAIRO_FORMAT_ARGB32 is reported via printf.
71 static Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
72 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
73 printf(
"Unexpected surface format %d\n",
74 cairo_image_surface_get_format(surface));
77 const int width = cairo_image_surface_get_width(surface);
78 const int height = cairo_image_surface_get_height(surface);
79 Pix* pix = pixCreate(width, height, 32);
// Bytes per surface row as reported by Cairo.
80 int byte_stride = cairo_image_surface_get_stride(surface);
// Copy row by row. Each destination row starts one byte into the Pix row, and
// the final row copies one byte fewer.
// NOTE(review): presumably the one-byte shift remaps Cairo's channel order
// onto the Pix layout and the shortened last row keeps the copy inside the
// Pix buffer -- confirm. This also assumes byte_stride matches the Pix row
// size (pix->wpl words).
82 for (
int i = 0; i < height; ++i) {
83 memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
84 cairo_image_surface_get_data(surface) + i * byte_stride,
85 byte_stride - ((i == height - 1) ? 1 : 0));
93 page_width_(page_width),
94 page_height_(page_height),
97 pen_color_{0.0, 0.0, 0.0},
100 vertical_text_(
false),
101 gravity_hint_strong_(
false),
102 render_fullwidth_latin_(
false),
103 underline_start_prob_(0),
104 underline_continuation_prob_(0),
105 underline_style_(PANGO_UNDERLINE_SINGLE),
107 drop_uncovered_chars_(
true),
108 strip_unrenderable_words_(
false),
109 add_ligatures_(
false),
110 output_word_boxes_(
false),
117 page_boxes_(
nullptr),
121 set_resolution(kDefaultOutputResolution);
// Parses the font description name |desc| into font_ and then re-applies the
// current resolution_ to font_. |success| holds the result of the parse.
125 bool StringRenderer::set_font(
const std::string& desc) {
126 bool success = font_.ParseFontDescriptionName(desc);
127 font_.set_resolution(resolution_);
// Records |resolution| in resolution_ and forwards the same value to font_.
131 void StringRenderer::set_resolution(
const int resolution) {
132 resolution_ = resolution;
133 font_.set_resolution(resolution);
// Sets the probability used to decide whether an underline starts at a word.
// |frac| is clamped to the range [0.0, 1.0].
136 void StringRenderer::set_underline_start_prob(
const double frac) {
137 underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
// Sets the probability used to decide whether an open underline continues
// over the next word. |frac| is clamped to the range [0.0, 1.0].
140 void StringRenderer::set_underline_continuation_prob(
const double frac) {
141 underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
// Destructor.
// NOTE(review): presumably releases the boxes and page boxes held by the
// renderer -- confirm against the full body.
144 StringRenderer::~StringRenderer() {
// Creates the Cairo ARGB32 image surface, the Cairo context drawing onto it,
// and the Pango layout bound to that context, then applies the layout
// properties via SetLayoutProperties().
150 void StringRenderer::InitPangoCairo() {
152 surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
154 cr_ = cairo_create(surface_);
157 layout_ = pango_cairo_create_layout(cr_);
// Vertical text: switch the layout context's base gravity to EAST and, when
// requested, use the strong gravity hint. The layout is then told that its
// context changed.
160 if (vertical_text_) {
161 PangoContext* context = pango_layout_get_context(layout_);
162 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
163 if (gravity_hint_strong_) {
164 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
166 pango_layout_context_changed(layout_);
169 SetLayoutProperties();
// Applies the renderer settings to layout_: font description, resolution,
// wrap width and mode, letter spacing, optional ligature font features, and
// line spacing.
172 void StringRenderer::SetLayoutProperties() {
// Build a Pango font description from the description string, hand it to the
// layout, and free the local description right afterwards.
175 PangoFontDescription *desc =
176 pango_font_description_from_string(font_desc.c_str());
178 pango_layout_set_font_description(layout_, desc);
179 pango_font_description_free(desc);
180 pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
// The usable area is the page minus one margin on each side.
183 int max_width = page_width_ - 2 * h_margin_;
184 int max_height = page_height_ - 2 * v_margin_;
185 tlog(3,
"max_width = %d, max_height = %d\n", max_width, max_height);
// For vertical text the two dimensions trade places before the wrap width is
// set.
186 if (vertical_text_) {
188 swap(max_width, max_height);
// Pixel widths are multiplied by PANGO_SCALE to obtain Pango units.
190 pango_layout_set_width(layout_, max_width * PANGO_SCALE);
192 pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
// Collect text attributes in a fresh list.
195 PangoAttrList* attr_list = pango_attr_list_new();
// Letter spacing spans from byte 0 to the largest guint value, i.e. the
// whole text.
197 PangoAttribute* spacing_attr =
198 pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
199 spacing_attr->start_index = 0;
200 spacing_attr->end_index = static_cast<guint>(-1);
201 pango_attr_list_change(attr_list, spacing_attr);
// Font-feature attributes are compiled in only for Pango 1.x with a minor
// version of at least 38.
203 #if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38)
204 if (add_ligatures_) {
205 set_features(
"liga, clig, dlig, hlig");
206 PangoAttribute* feature_attr = pango_attr_font_features_new(features_);
207 pango_attr_list_change(attr_list, feature_attr);
// Install the attribute list on the layout and drop the local reference.
210 pango_layout_set_attributes(layout_, attr_list);
211 pango_attr_list_unref(attr_list);
// Extra spacing between lines, converted to Pango units.
214 pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
// Releases the Pango layout reference and destroys the Cairo surface.
// NOTE(review): presumably mirrors InitPangoCairo(), which also creates cr_ --
// confirm that the Cairo context is released too.
218 void StringRenderer::FreePangoCairo() {
220 g_object_unref(layout_);
228 cairo_surface_destroy(surface_);
// Randomly underlines runs of words in |page_text| by inserting underline
// attributes into the layout's attribute list.
// Does nothing when underline_start_prob_ is zero.
233 void StringRenderer::SetWordUnderlineAttributes(
const std::string& page_text) {
234 if (underline_start_prob_ == 0)
return;
// Attributes are inserted directly into the list the layout already holds.
235 PangoAttrList* attr_list = pango_layout_get_attributes(layout_);
237 const char* text = page_text.c_str();
// started_underline tracks whether und_attr is an open underline that has
// not yet been inserted into the list.
240 bool started_underline =
false;
241 PangoAttribute* und_attr =
nullptr;
// Walk the text one word at a time; offsets are byte indices into page_text.
243 while (offset < page_text.length()) {
245 if (offset == page_text.length())
break;
247 int word_start = offset;
// An underline is open: either stretch it over this word or insert it into
// the list and mark it closed.
250 if (started_underline) {
252 if (RandBool(underline_continuation_prob_, &rand)) {
254 und_attr->end_index = word_start + word_len;
258 pango_attr_list_insert(attr_list, und_attr);
259 started_underline =
false;
// No underline is open: possibly start a new one covering this word.
263 if (!started_underline && RandBool(underline_start_prob_, &rand)) {
265 und_attr = pango_attr_underline_new(underline_style_);
266 und_attr->start_index = word_start;
267 und_attr->end_index = word_start + word_len;
268 started_underline =
true;
// An underline still open after the last word is extended to the end of the
// text and inserted.
272 if (started_underline) {
273 und_attr->end_index = page_text.length();
274 pango_attr_list_insert(attr_list, und_attr);
// Lays out a bounded prefix of |text| and looks for the first line that no
// longer fits within the page's layout height, recording that line's start
// byte index in |offset|.
// Returns 0 when |text_length| is 0.
// NOTE(review): |offset| is presumably the value returned otherwise -- confirm.
279 int StringRenderer::FindFirstPageBreakOffset(
const char* text,
281 if (!text_length)
return 0;
// Page area inside the margins; for vertical text the width limits the
// layout height.
282 const int max_height = (page_height_ - 2 * v_margin_);
283 const int max_width = (page_width_ - 2 * h_margin_);
284 const int max_layout_height = vertical_text_ ? max_width : max_height;
// Advance |it| by at most kMaxUnicodeBufLength characters (the loop body is
// empty) so that only a bounded prefix of the text is given to Pango.
286 UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
287 const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
288 const int kMaxUnicodeBufLength = 15000;
289 for (
int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
// Byte length of that prefix.
290 int buf_length = it.utf8_data() - text;
291 tlog(1,
"len = %d buf_len = %d\n", text_length, buf_length);
292 pango_layout_set_text(layout_, text, buf_length);
294 PangoLayoutIter* line_iter =
nullptr;
297 line_iter = pango_layout_get_iter(layout_);
299 bool first_page =
true;
// Default: the whole buffered prefix fits.
301 int offset = buf_length;
// Measure each line's extents in pixels.
304 PangoRectangle line_ink_rect;
305 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect,
nullptr);
306 pango_extents_to_pixels(&line_ink_rect,
nullptr);
307 PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
309 page_top = line_ink_rect.y;
// A line whose bottom lies more than max_layout_height below page_top marks
// the break; its start byte index is recorded.
312 int line_bottom = line_ink_rect.y + line_ink_rect.height;
313 if (line_bottom - page_top > max_layout_height) {
314 offset = line->start_index;
315 tlog(1,
"Found offset = %d\n", offset);
318 }
while (pango_layout_iter_next_line(line_iter));
319 pango_layout_iter_free(line_iter);
// Const accessor returning a reference to a vector of BoxChar pointers.
// NOTE(review): presumably returns boxchars_ -- confirm.
323 const std::vector<BoxChar*>& StringRenderer::GetBoxes()
const {
// Const accessor returning a Boxa pointer.
// NOTE(review): presumably returns page_boxes_ -- confirm.
327 Boxa* StringRenderer::GetPageBoxes()
const {
// Rotates the entries of boxchars_ from index start_box_ up to its current
// size by |rotation|, passing the page midpoint (page_width_ / 2,
// page_height_ / 2) to BoxChar::RotateBoxes.
// NOTE(review): the midpoint is presumably the centre of rotation -- confirm.
331 void StringRenderer::RotatePageBoxes(
float rotation) {
332 BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2,
333 start_box_, boxchars_.size(), &boxchars_);
// Deletes every BoxChar pointed to by boxchars_ and destroys page_boxes_.
337 void StringRenderer::ClearBoxes() {
338 for (
size_t i = 0; i < boxchars_.size(); ++i)
delete boxchars_[i];
340 boxaDestroy(&page_boxes_);
344 BoxChar::PrepareToWrite(&boxchars_);
345 return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
349 BoxChar::PrepareToWrite(&boxchars_);
350 BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
// Collects the text of each glyph cluster in the layout and writes the
// strings to |cluster_text| in ascending order of their starting byte index.
// Returns true if at least one string was collected.
354 bool StringRenderer::GetClusterStrings(std::vector<std::string>* cluster_text) {
// Keyed by starting byte index; std::map iteration yields ascending keys.
355 std::map<int, std::string> start_byte_to_text;
356 PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
357 const char* full_text = pango_layout_get_text(layout_);
359 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
362 tlog(2,
"Found end of line marker\n");
// Visit every cluster of the current run.
365 PangoGlyphItemIter cluster_iter;
366 gboolean have_cluster;
367 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
370 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
371 const int start_byte_index = cluster_iter.start_index;
372 const int end_byte_index = cluster_iter.end_index;
374 end_byte_index - start_byte_index);
376 tlog(2,
"Found whitespace\n");
379 tlog(2,
"start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
380 end_byte_index, text.c_str());
// With ligatures enabled, the cluster text is passed through the ligature
// table (no font is supplied here).
381 if (add_ligatures_) {
384 text = LigatureTable::Get()->AddLigatures(text,
nullptr);
386 start_byte_to_text[start_byte_index] = text;
388 }
while (pango_layout_iter_next_run(run_iter));
389 pango_layout_iter_free(run_iter);
// Replace the caller's vector contents with the collected strings.
391 cluster_text->clear();
392 for (std::map<int, std::string>::const_iterator it = start_byte_to_text.begin();
393 it != start_byte_to_text.end(); ++it) {
394 cluster_text->push_back(it->second);
396 return !cluster_text->empty();
// Merges consecutive character boxes in |boxchars| into word boxes.
// Entries are moved into a new vector or merged into the previous entry and
// deleted; the new vector is finally swapped into |boxchars|.
409 static void MergeBoxCharsToWords(std::vector<BoxChar*>* boxchars) {
410 std::vector<BoxChar*> result;
411 bool started_word =
false;
412 for (
size_t i = 0; i < boxchars->size(); ++i) {
// A space, or an entry without a box, is carried over unchanged and ends the
// current word.
413 if (boxchars->at(i)->ch() ==
" " || boxchars->at(i)->box() ==
nullptr) {
414 result.push_back(boxchars->at(i));
415 boxchars->at(i) =
nullptr;
416 started_word =
false;
// Ownership of the entry moves to |result|; the source slot is cleared.
423 result.push_back(boxchars->at(i));
424 boxchars->at(i) =
nullptr;
// Compute the bounding rectangle of the last result box and the current box.
426 BoxChar* last_boxchar = result.back();
428 const Box* box = boxchars->at(i)->box();
429 Box* last_box = last_boxchar->mutable_box();
430 int left = std::min(last_box->x, box->x);
431 int right = std::max(last_box->x + last_box->w, box->x + box->w);
432 int top = std::min(last_box->y, box->y);
433 int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
// A combined width far larger than the two boxes would suggest is treated as
// a line break: a space entry is inserted and the current entry is carried
// over on its own.
437 if (right - left > last_box->w + 5 * box->w) {
438 tlog(1,
"Found line break after '%s'", last_boxchar->ch().c_str());
441 result.push_back(
new BoxChar(
" ", 1));
442 result.push_back(boxchars->at(i));
443 boxchars->at(i) =
nullptr;
// Merge: append the text to the previous entry, grow its box, and delete the
// current entry.
447 last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
449 last_box->w = right - left;
451 last_box->h = bottom - top;
452 delete boxchars->at(i);
453 boxchars->at(i) =
nullptr;
456 boxchars->swap(result);
// Builds one BoxChar per glyph cluster of the current layout, adjusts the
// box positions to the page, optionally merges characters into words, appends
// the results to boxchars_, and adds the bounding box of this page's boxes to
// page_boxes_.
460 void StringRenderer::ComputeClusterBoxes() {
461 const char* text = pango_layout_get_text(layout_);
462 PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
// First pass: record the starting byte index of every cluster.
465 std::vector<int> cluster_start_indices;
467 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
468 tlog(3,
"Added %d\n", cluster_start_indices.back());
469 }
while (pango_layout_iter_next_cluster(cluster_iter));
470 pango_layout_iter_free(cluster_iter);
// The text length is added as a final index so the last cluster has an end.
471 cluster_start_indices.push_back(strlen(text));
472 tlog(3,
"Added last index %d\n", cluster_start_indices.back());
// After sorting, each cluster ends where the next larger start index begins.
474 std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
475 std::map<int, int> cluster_start_to_end_index;
476 for (
size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
477 cluster_start_to_end_index[cluster_start_indices[i]]
478 = cluster_start_indices[i + 1];
// Second pass: measure each cluster and create its BoxChar, keyed by start
// byte index.
483 cluster_iter = pango_layout_get_iter(layout_);
485 std::map<int, BoxChar*> start_byte_to_box;
// Cluster extents, converted to pixels.
487 PangoRectangle cluster_rect;
488 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
nullptr);
489 pango_extents_to_pixels(&cluster_rect,
nullptr);
490 const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491 const int end_byte_index = cluster_start_to_end_index[start_byte_index];
493 end_byte_index - start_byte_index);
494 if (!cluster_text.empty() && cluster_text[0] ==
'\n') {
495 tlog(2,
"Skipping newlines at start of text.\n");
// Clusters with a zero width or height are recorded as a single space
// without a box.
498 if (!cluster_rect.width || !cluster_rect.height ||
500 tlog(2,
"Skipping whitespace with boxdim (%d,%d) '%s'\n",
501 cluster_rect.width, cluster_rect.height, cluster_text.c_str());
502 BoxChar* boxchar =
new BoxChar(
" ", 1);
503 boxchar->set_page(page_);
504 start_byte_to_box[start_byte_index] = boxchar;
508 tlog(2,
"[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
509 cluster_rect.x, cluster_rect.y,
510 cluster_rect.width, cluster_rect.height,
511 start_byte_index, end_byte_index,
512 cluster_text.c_str());
514 "cluster_text:%s start_byte_index:%d\n",
515 cluster_text.c_str(), start_byte_index);
517 "cluster_text:%s start_byte_index:%d\n",
518 cluster_text.c_str(), start_byte_index);
// Grow the rectangle by box_padding_ on every side, keeping the origin from
// going negative.
520 cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
521 cluster_rect.width += 2 * box_padding_;
522 cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
523 cluster_rect.height += 2 * box_padding_;
525 if (add_ligatures_) {
528 cluster_text = LigatureTable::Get()->AddLigatures(cluster_text,
nullptr);
530 BoxChar* boxchar =
new BoxChar(cluster_text.c_str(), cluster_text.size());
531 boxchar->set_page(page_);
532 boxchar->AddBox(cluster_rect.x, cluster_rect.y,
533 cluster_rect.width, cluster_rect.height);
534 start_byte_to_box[start_byte_index] = boxchar;
535 }
while (pango_layout_iter_next_cluster(cluster_iter));
536 pango_layout_iter_free(cluster_iter);
// If GetClusterStrings() produced text, it must yield exactly one string per
// box; those strings then replace the text of the boxes in start-byte order.
544 std::vector<std::string> cluster_text;
545 if (GetClusterStrings(&cluster_text)) {
546 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
548 for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
549 it != start_byte_to_box.end(); ++it, ++ind) {
550 it->second->mutable_ch()->swap(cluster_text[ind]);
// Gather this page's boxes in start-byte order; entries whose text equals the
// word joiner get separate handling.
555 std::vector<BoxChar*> page_boxchars;
556 page_boxchars.reserve(start_byte_to_box.size());
558 for (std::map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
559 it != start_byte_to_box.end(); ++it) {
560 if (it->second->ch() == kWordJoinerUTF8) {
564 page_boxchars.push_back(it->second);
567 CorrectBoxPositionsToLayout(&page_boxchars);
// Text rendered as fullwidth Latin is converted back to basic Latin in the
// box labels.
569 if (render_fullwidth_latin_) {
570 for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
571 it != start_byte_to_box.end(); ++it) {
573 std::string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
574 it->second->mutable_ch()->swap(half);
579 if (output_word_boxes_) {
580 MergeBoxCharsToWords(&page_boxchars);
// Append this page's boxes to the renderer-wide list.
583 boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
// Compute the extent of all boxes on this page and add it to page_boxes_.
// Entries without a box are skipped; both Boxa containers are created on
// first use.
586 Box* page_box =
nullptr;
587 Boxa* all_boxes =
nullptr;
588 for (
size_t i = 0; i < page_boxchars.size(); ++i) {
589 if (page_boxchars[i]->box() ==
nullptr)
continue;
590 if (all_boxes ==
nullptr) all_boxes = boxaCreate(0);
591 boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
593 if (all_boxes !=
nullptr) {
594 boxaGetExtent(all_boxes,
nullptr,
nullptr, &page_box);
595 boxaDestroy(&all_boxes);
596 if (page_boxes_ ==
nullptr) page_boxes_ = boxaCreate(0);
597 boxaAddBox(page_boxes_, page_box, L_INSERT);
// Moves |boxchars| from layout coordinates to page coordinates.
// Vertical text: translate by (page_width_ - h_margin_, v_margin_) and rotate
// by the negated rotation of the layout's base gravity, passing that same
// point to RotateBoxes. Otherwise: translate by (h_margin_, v_margin_).
602 void StringRenderer::CorrectBoxPositionsToLayout(
603 std::vector<BoxChar*>* boxchars) {
604 if (vertical_text_) {
605 const double rotation = - pango_gravity_to_rotation(
606 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
607 BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
608 BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_,
609 0, boxchars->size(), boxchars);
611 BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
// Rewrites |utf8_text| in place, keeping whitespace runs and those words that
// font_ reports it can render. Logs the number of dropped words when it is
// positive.
// NOTE(review): presumably returns num_dropped -- confirm.
615 int StringRenderer::StripUnrenderableWords(
std::string* utf8_text)
const {
617 const char* text = utf8_text->c_str();
620 while (offset < utf8_text->length()) {
// Whitespace is always copied through.
622 output_text.append(text + offset, space_len);
624 if (offset == utf8_text->length())
break;
// A word is copied only when the font can render it.
627 if (font_.CanRenderString(text + offset, word_len)) {
628 output_text.append(text + offset, word_len);
// Replace the caller's string with the filtered text.
634 utf8_text->swap(output_text);
636 if (num_dropped > 0) {
637 tprintf(
"Stripped %d unrenderable words\n", num_dropped);
// Renders |text| with RenderToImage() and stores an 8-bit conversion of the
// result in |*pix|. The intermediate image is destroyed afterwards.
// NOTE(review): presumably returns the offset reported by RenderToImage() --
// confirm.
642 int StringRenderer::RenderToGrayscaleImage(
const char* text,
int text_length,
644 Pix* orig_pix =
nullptr;
645 int offset = RenderToImage(text, text_length, &orig_pix);
647 *pix = pixConvertTo8(orig_pix,
false);
648 pixDestroy(&orig_pix);
// Renders |text| with RenderToImage(), converts the result to 8 bits, and
// stores the image thresholded at |threshold| in |*pix|. Both intermediate
// images are destroyed.
// NOTE(review): presumably returns the offset reported by RenderToImage() --
// confirm.
653 int StringRenderer::RenderToBinaryImage(
const char* text,
int text_length,
654 int threshold, Pix** pix) {
655 Pix* orig_pix =
nullptr;
656 int offset = RenderToImage(text, text_length, &orig_pix);
658 Pix* gray_pix = pixConvertTo8(orig_pix,
false);
659 pixDestroy(&orig_pix);
660 *pix = pixThresholdToBinary(gray_pix, threshold);
661 pixDestroy(&gray_pix);
676 it != it_end; ++it) {
678 out_str.append(it.utf8_data(), it.utf8_len());
682 bool next_char_is_boundary = (next_it == it_end || *next_it ==
' ');
683 bool next_char_is_combiner = (next_it == it_end) ?
684 false : IsCombiner(*next_it);
685 if (*it !=
' ' && *it !=
'\n' && !next_char_is_boundary &&
686 !next_char_is_combiner) {
687 out_str += kWordJoinerUTF8;
696 const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
698 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
699 it != it_end; ++it) {
704 char32 full_char = *it + 0xFEE0;
705 full_str.append(EncodeAsUTF8(full_char));
707 full_str.append(it.utf8_data(), it.utf8_len());
718 it != it_end; ++it) {
723 isprint(half_char) && !isspace(half_char)) {
724 half_str.append(EncodeAsUTF8(half_char));
726 half_str.append(it.utf8_data(), it.utf8_len());
// Renders the first page of |text| onto the Cairo surface, converts the
// surface to a Pix stored in |*pix|, and computes the cluster boxes for the
// page. Any image already held in |*pix| is destroyed first.
733 int StringRenderer::RenderToImage(
const char* text,
int text_length,
735 if (pix && *pix) pixDestroy(pix);
// Byte offset at which the first page of the text ends.
738 const int page_offset = FindFirstPageBreakOffset(text, text_length);
// Boxes created for this page start at this index of boxchars_.
742 start_box_ = boxchars_.size();
// Horizontal text: shift the drawing origin to the top-left margin.
744 if (!vertical_text_) {
746 cairo_translate(cr_, h_margin_, v_margin_);
// Vertical text: shift the origin to the top-right margin and rotate by the
// negated rotation of the layout's base gravity.
755 cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
757 double rotation = - pango_gravity_to_rotation(
758 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
759 tlog(2,
"Rotating by %f radians\n", rotation);
760 cairo_rotate(cr_, rotation);
761 pango_cairo_update_layout(cr_, layout_);
// Optional text transformations, applied in this order: fullwidth Latin
// conversion, stripping of unrenderable words, dropping of uncovered
// characters, ligature substitution, underline attributes.
764 if (render_fullwidth_latin_) {
766 page_text = ConvertBasicLatinToFullwidthLatin(page_text);
768 if (strip_unrenderable_words_) {
769 StripUnrenderableWords(&page_text);
771 if (drop_uncovered_chars_ &&
772 !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
773 int num_dropped = font_.DropUncoveredChars(&page_text);
775 tprintf(
"WARNING: Dropped %d uncovered characters\n", num_dropped);
778 if (add_ligatures_) {
780 page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
782 if (underline_start_prob_ > 0) {
783 SetWordUnderlineAttributes(page_text);
786 pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
// White is selected as the source colour first, then the pen colour.
790 cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0);
795 cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
798 pango_cairo_update_layout(cr_, layout_);
// Draw the layout, then hand the rendered surface back as a Pix.
802 pango_cairo_show_layout(cr_, layout_);
804 *pix = CairoARGB32ToPixFormat(surface_);
806 ComputeClusterBoxes();
// Walks the available fonts starting at font_index_ and renders |text| with a
// font whose number of renderable characters is positive and at least
// total_chars_ * |min_coverage|. A title line describing the font is rendered
// with a separate title font and OR-ed into the image.
// Returns -1 when last_offset_ is 0, otherwise last_offset_.
834 int StringRenderer::RenderAllFontsToImage(
double min_coverage,
835 const char* text,
int text_length,
// printf-style template for the title line: font name, hit count and
// percentage, raw score and percentage.
839 const char kTitleTemplate[] =
"%s : %d hits = %.2f%%, raw = %d = %.2f%%";
// Pick a font able to render the title template; fall back to "Arial".
841 if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
842 &title_font,
nullptr)) {
843 tprintf(
"WARNING: Could not find a font to render image title with!\n");
844 title_font =
"Arial";
847 tlog(1,
"Selected title font: %s\n", title_font.c_str());
848 if (font_used) font_used->clear();
// The character map is built only when it is still empty.
851 if (char_map_.empty()) {
855 it != UNICHAR::end(text, text_length); ++it) {
859 tprintf(
"Total chars = %d\n", total_chars_);
861 const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts();
// Resume from the font index stored in font_index_.
863 for (
size_t i = font_index_; i < all_fonts.size(); ++i) {
867 FontUtils::FontScore(char_map_, all_fonts[i], &raw_score,
nullptr);
868 if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
// Render the text itself with the candidate font, binarised at threshold 128.
869 set_font(all_fonts[i]);
870 int offset = RenderToBinaryImage(text, text_length, 128, image);
// Format the title into a fixed-size buffer; snprintf bounds the write.
872 const int kMaxTitleLength = 1024;
873 char title[kMaxTitleLength];
874 snprintf(title, kMaxTitleLength, kTitleTemplate,
875 all_fonts[i].c_str(), ok_chars,
876 100.0 * ok_chars / total_chars_, raw_score,
877 100.0 * raw_score / char_map_.size());
882 last_offset_ = offset;
883 if (font_used) *font_used = all_fonts[i];
// Render the title with the title font and OR it into the page image.
886 set_font(title_font);
888 Pix* title_image =
nullptr;
889 RenderToBinaryImage(title, strlen(title), 128, &title_image);
890 pixOr(*image, *image, title_image);
891 pixDestroy(&title_image);
// The font did not reach the required coverage.
898 tprintf(
"Font %s failed with %d hits = %.2f%%\n",
899 all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
904 return last_offset_ == 0 ? -1 : last_offset_;