30 #include "allheaders.h"
34 #include "pango/pango-font.h"
35 #include "pango/pango-glyph-item.h"
38 #include "unicode/uchar.h"
41 #ifdef USE_STD_NAMESPACE
// Default output resolution. The unit is not stated in this excerpt
// (presumably dots per inch -- TODO confirm against where it is used).
50 static const int kDefaultOutputResolution = 300;
// U+2060 spelled as a narrow string literal; it is appended to output text
// and compared against box text elsewhere in this file.
55 static const char* kWordJoinerUTF8 =
"\u2060";
// The same U+2060 code point expressed as a char32 value.
56 static const char32 kWordJoiner = 0x2060;
58 static bool IsCombiner(
int ch) {
59 const int char_type = u_charType(ch);
60 return ((char_type == U_NON_SPACING_MARK) ||
61 (char_type == U_ENCLOSING_MARK) ||
62 (char_type == U_COMBINING_SPACING_MARK));
// Returns a string holding uni_ch.utf8_len() bytes taken from uni_ch.utf8().
// NOTE(review): the declaration of uni_ch is not visible in this excerpt;
// presumably it is constructed from ch32 -- confirm against the full file.
65 static string EncodeAsUTF8(
const char32 ch32) {
67 return string(uni_ch.utf8(), uni_ch.utf8_len());
71 static bool RandBool(
const double prob, TRand* rand) {
72 if (prob == 1.0)
return true;
73 if (prob == 0.0)
return false;
74 return rand->UnsignedRand(1.0) < prob;
// Only CAIRO_FORMAT_ARGB32 surfaces are expected here; any other format is
// reported through printf.
79 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
80 printf(
"Unexpected surface format %d\n",
81 cairo_image_surface_get_format(surface));
// Create a 32-bit-deep Pix with the same pixel dimensions as the surface.
84 const int width = cairo_image_surface_get_width(surface);
85 const int height = cairo_image_surface_get_height(surface);
86 Pix* pix = pixCreate(width, height, 32);
// Number of bytes per surface row, as reported by cairo.
87 int byte_stride = cairo_image_surface_get_stride(surface);
// Copy the surface one row at a time. Each destination row starts one byte
// past the beginning of the corresponding Pix row (pix->wpl words per row),
// and the final row copies one byte fewer than the others.
// NOTE(review): presumably the one-byte shift re-aligns the colour channels
// between the two pixel layouts and the shortened last row keeps the shifted
// copy inside the Pix buffer -- confirm.
89 for (
int i = 0; i < height; ++i) {
90 memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
91 cairo_image_surface_get_data(surface) + i * byte_stride,
92 byte_stride - ((i == height - 1) ? 1 : 0));
99 : page_width_(page_width),
100 page_height_(page_height),
105 vertical_text_(false),
106 gravity_hint_strong_(false),
107 render_fullwidth_latin_(false),
108 underline_start_prob_(0),
109 underline_continuation_prob_(0),
110 underline_style_(PANGO_UNDERLINE_SINGLE),
111 drop_uncovered_chars_(true),
112 strip_unrenderable_words_(false),
113 add_ligatures_(false),
114 output_word_boxes_(false),
167 PangoContext* context = pango_layout_get_context(
layout_);
168 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
170 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
172 pango_layout_context_changed(
layout_);
181 PangoFontDescription *desc =
182 pango_font_description_from_string(font_desc.c_str());
184 pango_layout_set_font_description(
layout_, desc);
185 pango_font_description_free(desc);
186 pango_cairo_context_set_resolution(pango_layout_get_context(
layout_),
191 tlog(3,
"max_width = %d, max_height = %d\n", max_width, max_height);
193 swap(max_width, max_height);
195 pango_layout_set_width(
layout_, max_width * PANGO_SCALE);
196 pango_layout_set_wrap(
layout_, PANGO_WRAP_WORD);
199 PangoAttrList* attr_list = pango_attr_list_new();
201 PangoAttribute* spacing_attr = pango_attr_letter_spacing_new(
203 spacing_attr->start_index = 0;
204 spacing_attr->end_index =
static_cast<guint
>(-1);
205 pango_attr_list_change(attr_list, spacing_attr);
207 pango_layout_set_attributes(
layout_, attr_list);
208 pango_attr_list_unref(attr_list);
232 PangoAttrList* attr_list = pango_layout_get_attributes(
layout_);
234 const char* text = page_text.c_str();
237 bool started_underline =
false;
238 PangoAttribute* und_attr =
nullptr;
240 while (offset < page_text.length()) {
242 if (offset == page_text.length())
break;
244 int word_start = offset;
247 if (started_underline) {
251 und_attr->end_index = word_start + word_len;
255 pango_attr_list_insert(attr_list, und_attr);
256 started_underline =
false;
263 und_attr->start_index = word_start;
264 und_attr->end_index = word_start + word_len;
265 started_underline =
true;
269 if (started_underline) {
270 und_attr->end_index = page_text.length();
271 pango_attr_list_insert(attr_list, und_attr);
278 if (!text_length)
return 0;
281 const int max_layout_height =
vertical_text_ ? max_width : max_height;
285 const int kMaxUnicodeBufLength = 15000;
286 for (
int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
288 tlog(1,
"len = %d buf_len = %d\n", text_length, buf_length);
289 pango_layout_set_text(
layout_, text, buf_length);
291 PangoLayoutIter* line_iter =
NULL;
294 line_iter = pango_layout_get_iter(
layout_);
296 bool first_page =
true;
298 int offset = buf_length;
301 PangoRectangle line_ink_rect;
302 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect,
NULL);
303 pango_extents_to_pixels(&line_ink_rect,
NULL);
304 PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
306 page_top = line_ink_rect.y;
309 int line_bottom = line_ink_rect.y + line_ink_rect.height;
310 if (line_bottom - page_top > max_layout_height) {
311 offset = line->start_index;
312 tlog(1,
"Found offset = %d\n", offset);
315 }
while (pango_layout_iter_next_line(line_iter));
316 pango_layout_iter_free(line_iter);
335 for (
int i = 0; i <
boxchars_.size(); ++i)
348 map<int, string> start_byte_to_text;
349 PangoLayoutIter* run_iter = pango_layout_get_iter(
layout_);
350 const char* full_text = pango_layout_get_text(
layout_);
352 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
355 tlog(2,
"Found end of line marker\n");
358 PangoGlyphItemIter cluster_iter;
359 gboolean have_cluster;
360 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
363 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
364 const int start_byte_index = cluster_iter.start_index;
365 const int end_byte_index = cluster_iter.end_index;
366 string text = string(full_text + start_byte_index,
367 end_byte_index - start_byte_index);
369 tlog(2,
"Found whitespace\n");
372 tlog(2,
"start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
373 end_byte_index, text.c_str());
379 start_byte_to_text[start_byte_index] = text;
381 }
while (pango_layout_iter_next_run(run_iter));
382 pango_layout_iter_free(run_iter);
384 cluster_text->clear();
385 for (map<int, string>::const_iterator it = start_byte_to_text.begin();
386 it != start_byte_to_text.end(); ++it) {
387 cluster_text->push_back(it->second);
389 return cluster_text->size();
// Rebuilds *boxchars into a new vector (result) and swaps it in at the end.
// Entries moved into result have their slot in *boxchars set to NULL; entries
// merged into the previous result entry are deleted.
// NOTE(review): several lines of this function (branch conditions and closing
// braces) are missing from this excerpt, so the comments below describe only
// the statements that are visible.
402 static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) {
403 vector<BoxChar*> result;
404 bool started_word =
false;
405 for (
int i = 0; i < boxchars->size(); ++i) {
// A single-space entry or an entry without a box is moved over unchanged and
// clears started_word.
406 if (boxchars->at(i)->ch() ==
" " ||
407 boxchars->at(i)->box() ==
NULL) {
408 result.push_back(boxchars->at(i));
409 boxchars->at(i) =
NULL;
410 started_word =
false;
// Move the current entry into result as a new element.
// NOTE(review): the condition guarding this is not visible; presumably this
// is the "no word started yet" case -- confirm.
417 result.push_back(boxchars->at(i));
418 boxchars->at(i) =
NULL;
// Compute the bounding box covering both the last result entry and the
// current entry.
420 BoxChar* last_boxchar = result.back();
422 const Box* box = boxchars->at(i)->box();
423 Box* last_box = last_boxchar->mutable_box();
424 int left = min(last_box->x, box->x);
425 int right = max(last_box->x + last_box->w, box->x + box->w);
426 int top = min(last_box->y, box->y);
427 int bottom = max(last_box->y + last_box->h, box->y + box->h);
// If the combined width exceeds the last box's width plus five times the
// current box's width, the log message treats it as a line break: a new
// single-space BoxChar is inserted and the current entry is moved over as a
// separate element instead of being merged.
431 if (right - left > last_box->w + 5 * box->w) {
432 tlog(1,
"Found line break after '%s'", last_boxchar->ch().c_str());
435 result.push_back(
new BoxChar(
" ", 1));
436 result.push_back(boxchars->at(i));
437 boxchars->at(i) =
NULL;
// Merge: append the current text to the last entry, grow its box to the
// combined width/height, then free the merged BoxChar.
// NOTE(review): assignments to last_box->x / last_box->y are not visible in
// this excerpt -- confirm they exist in the full file.
441 last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
443 last_box->w = right - left;
445 last_box->h = bottom - top;
446 delete boxchars->at(i);
447 boxchars->at(i) =
NULL;
// Hand the rebuilt list back to the caller.
450 boxchars->swap(result);
455 const char* text = pango_layout_get_text(
layout_);
456 PangoLayoutIter* cluster_iter = pango_layout_get_iter(
layout_);
459 vector<int> cluster_start_indices;
461 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
462 tlog(3,
"Added %d\n", cluster_start_indices.back());
463 }
while (pango_layout_iter_next_cluster(cluster_iter));
464 pango_layout_iter_free(cluster_iter);
465 cluster_start_indices.push_back(strlen(text));
466 tlog(3,
"Added last index %d\n", cluster_start_indices.back());
468 sort(cluster_start_indices.begin(), cluster_start_indices.end());
469 map<int, int> cluster_start_to_end_index;
470 for (
int i = 0; i < cluster_start_indices.size() - 1; ++i) {
471 cluster_start_to_end_index[cluster_start_indices[i]]
472 = cluster_start_indices[i + 1];
477 cluster_iter = pango_layout_get_iter(
layout_);
479 map<int, BoxChar*> start_byte_to_box;
481 PangoRectangle cluster_rect;
482 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
484 pango_extents_to_pixels(&cluster_rect,
NULL);
485 const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
486 const int end_byte_index = cluster_start_to_end_index[start_byte_index];
487 string cluster_text = string(text + start_byte_index,
488 end_byte_index - start_byte_index);
489 if (cluster_text.size() && cluster_text[0] ==
'\n') {
490 tlog(2,
"Skipping newlines at start of text.\n");
493 if (!cluster_rect.width || !cluster_rect.height ||
495 tlog(2,
"Skipping whitespace with boxdim (%d,%d) '%s'\n",
496 cluster_rect.width, cluster_rect.height, cluster_text.c_str());
498 boxchar->set_page(
page_);
499 start_byte_to_box[start_byte_index] = boxchar;
503 tlog(2,
"[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
504 cluster_rect.x, cluster_rect.y,
505 cluster_rect.width, cluster_rect.height,
506 start_byte_index, end_byte_index,
507 cluster_text.c_str());
509 "cluster_text:%s start_byte_index:%d\n",
510 cluster_text.c_str(), start_byte_index);
512 "cluster_text:%s start_byte_index:%d\n",
513 cluster_text.c_str(), start_byte_index);
525 BoxChar* boxchar =
new BoxChar(cluster_text.c_str(), cluster_text.size());
527 boxchar->
AddBox(cluster_rect.x, cluster_rect.y,
528 cluster_rect.width, cluster_rect.height);
529 start_byte_to_box[start_byte_index] = boxchar;
530 }
while (pango_layout_iter_next_cluster(cluster_iter));
531 pango_layout_iter_free(cluster_iter);
539 vector<string> cluster_text;
541 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
543 for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
544 it != start_byte_to_box.end(); ++it, ++ind) {
545 it->second->mutable_ch()->swap(cluster_text[ind]);
550 vector<BoxChar*> page_boxchars;
551 page_boxchars.reserve(start_byte_to_box.size());
553 for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
554 it != start_byte_to_box.end(); ++it) {
555 if (it->second->ch() == kWordJoinerUTF8) {
559 page_boxchars.push_back(it->second);
565 for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
566 it != start_byte_to_box.end(); ++it) {
569 it->second->mutable_ch()->swap(half);
575 MergeBoxCharsToWords(&page_boxchars);
581 Box* page_box =
NULL;
582 Boxa* all_boxes =
NULL;
583 for (
int i = 0; i < page_boxchars.size(); ++i) {
584 if (page_boxchars[i]->box() ==
NULL)
continue;
585 if (all_boxes ==
NULL)
586 all_boxes = boxaCreate(0);
587 boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
589 boxaGetExtent(all_boxes,
NULL,
NULL, &page_box);
590 boxaDestroy(&all_boxes);
599 const double rotation = - pango_gravity_to_rotation(
600 pango_context_get_base_gravity(pango_layout_get_context(
layout_)));
603 0, boxchars->size(), boxchars);
611 const char* text = utf8_text->c_str();
614 while (offset < utf8_text->length()) {
616 output_text.append(text + offset, space_len);
618 if (offset == utf8_text->length())
break;
622 output_text.append(text + offset, word_len);
628 utf8_text->swap(output_text);
630 if (num_dropped > 0) {
631 tprintf(
"Stripped %d unrenderable words\n", num_dropped);
638 Pix *orig_pix =
NULL;
641 *pix = pixConvertTo8(orig_pix,
false);
642 pixDestroy(&orig_pix);
648 int threshold, Pix** pix) {
649 Pix *orig_pix =
NULL;
652 Pix* gray_pix = pixConvertTo8(orig_pix,
false);
653 pixDestroy(&orig_pix);
654 *pix = pixThresholdToBinary(gray_pix, threshold);
655 pixDestroy(&gray_pix);
670 it != it_end; ++it) {
672 out_str.append(it.utf8_data(), it.utf8_len());
676 bool next_char_is_boundary = (next_it == it_end || *next_it ==
' ');
677 bool next_char_is_combiner = (next_it == it_end) ?
678 false : IsCombiner(*next_it);
679 if (*it !=
' ' && *it !=
'\n' && !next_char_is_boundary &&
680 !next_char_is_combiner) {
681 out_str += kWordJoinerUTF8;
693 it != it_end; ++it) {
698 char32 full_char = *it + 0xFEE0;
699 full_str.append(EncodeAsUTF8(full_char));
701 full_str.append(it.utf8_data(), it.utf8_len());
712 it != it_end; ++it) {
717 isprint(half_char) && !isspace(half_char)) {
718 half_str.append(EncodeAsUTF8(half_char));
720 half_str.append(it.utf8_data(), it.utf8_len());
729 if (pix && *pix) pixDestroy(pix);
751 double rotation = - pango_gravity_to_rotation(
752 pango_context_get_base_gravity(pango_layout_get_context(
layout_)));
753 tlog(2,
"Rotating by %f radians\n", rotation);
754 cairo_rotate(
cr_, rotation);
757 string page_text(text, page_offset);
769 tprintf(
"WARNING: Dropped %d uncovered characters\n", num_dropped);
780 pango_layout_set_text(
layout_, page_text.c_str(), page_text.length());
784 cairo_set_source_rgb(
cr_, 1.0, 1.0, 1.0);
828 const char* text,
int text_length,
829 string* font_used, Pix** image) {
832 const char kTitleTemplate[] =
"%s : %d hits = %.2f%%, raw = %d = %.2f%%";
835 &title_font,
NULL)) {
836 tprintf(
"WARNING: Could not find a font to render image title with!\n");
837 title_font =
"Arial";
840 tlog(1,
"Selected title font: %s\n", title_font.c_str());
841 if (font_used) font_used->clear();
855 for (
int i =
font_index_; i < all_fonts.size(); ++i) {
860 if (ok_chars > 0 && ok_chars >=
total_chars_ * min_coverage) {
864 const int kMaxTitleLength = 1024;
865 char title[kMaxTitleLength];
866 snprintf(title, kMaxTitleLength, kTitleTemplate,
867 all_fonts[i].c_str(), ok_chars,
875 if (font_used) *font_used = all_fonts[i];
880 Pix* title_image =
NULL;
882 pixOr(*image, *image, title_image);
883 pixDestroy(&title_image);
890 tprintf(
"Font %s failed with %d hits = %.2f%%\n",
891 all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars /
total_chars_);
cairo_surface_t * surface_
bool render_fullwidth_latin_
#define DISABLE_HEAP_LEAK_CHECK
string AddLigatures(const string &str, const PangoFontInfo *font) const
void WriteAllBoxes(const string &filename)
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
int RenderToGrayscaleImage(const char *text, int text_length, Pix **pix)
bool IsUTF8Whitespace(const char *text)
PangoUnderline underline_style_
static void TranslateBoxes(int xshift, int yshift, vector< BoxChar * > *boxes)
void set_resolution(const int resolution)
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, vector< string > *graphemes)
void set_underline_start_prob(const double frac)
void RotatePageBoxes(float rotation)
void SetLayoutProperties()
int DropUncoveredChars(string *utf8_text) const
int RenderToBinaryImage(const char *text, int text_length, int threshold, Pix **pix)
int SpanUTF8Whitespace(const char *text)
#define ASSERT_HOST_MSG(x, msg...)
vector< BoxChar * > boxchars_
string DescriptionName() const
void AddBox(int x, int y, int width, int height)
Boxa * GetPageBoxes() const
int SpanUTF8NotWhitespace(const char *text)
int StripUnrenderableWords(string *utf8_text) const
void set_underline_continuation_prob(const double frac)
const vector< BoxChar * > & GetBoxes() const
static LigatureTable * Get()
static void WriteTesseractBoxFile(const string &name, int height, const vector< BoxChar * > &boxes)
bool strip_unrenderable_words_
hash_map< char32, inT64 > char_map_
static const_iterator begin(const char *utf8_str, const int byte_length)
bool ParseFontDescriptionName(const string &name)
int FindFirstPageBreakOffset(const char *text, int text_length)
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, vector< BoxChar * > *boxes)
void SetWordUnderlineAttributes(const string &page_text)
bool set_font(const string &desc)
bool drop_uncovered_chars_
static void PrepareToWrite(vector< BoxChar * > *boxes)
static string InsertWordJoiners(const string &text)
bool GetClusterStrings(vector< string > *cluster_text)
bool gravity_hint_strong_
static int FontScore(const unordered_map< char32, inT64 > &ch_map, const string &fontname, int *raw_score, vector< bool > *ch_flags)
static string ConvertBasicLatinToFullwidthLatin(const string &text)
Pix * CairoARGB32ToPixFormat(cairo_surface_t *surface)
StringRenderer(const string &font_desc, int page_width, int page_height)
static string ConvertFullwidthLatinToBasicLatin(const string &text)
bool CanRenderString(const char *utf8_word, int len, vector< string > *graphemes) const
double underline_continuation_prob_
int RenderAllFontsToImage(double min_coverage, const char *text, int text_length, string *font_used, Pix **pix)
void CorrectBoxPositionsToLayout(vector< BoxChar * > *boxchars)
const char * utf8_data() const
char32 FullwidthToHalfwidth(const char32 ch)
int RenderToImage(const char *text, int text_length, Pix **pix)
double underline_start_prob_
static const vector< string > & ListAvailableFonts()
static const_iterator end(const char *utf8_str, const int byte_length)
void set_resolution(const int resolution)
void ComputeClusterBoxes()
bool IsInterchangeValid7BitAscii(const char32 ch)