22 #include "config_auto.h"
27 #undef __STRICT_ANSI__
33 #include <sys/param.h>
43 #include "pango/pango.h"
44 #include "pango/pangocairo.h"
45 #include "pango/pangofc-font.h"
48 "Overrides system default font location");
50 "Overrides fontconfig default temporary dir");
52 "Does a one-time deletion of cache files from the "
53 "fontconfig_tmpdir before initializing fontconfig.");
55 "Does a one-time reset of the fontconfig config file to point"
56 " to fonts_dir before initializing fontconfig. Set to true"
57 " if fontconfig_refresh_cache is true. Set it to false to use"
58 " multiple instances in separate processes without having to"
59 " rescan the fonts_dir, using a previously setup font cache");
61 #ifndef USE_STD_NAMESPACE
62 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
64 "Overrides --fonts_dir and sets the known universe of fonts to"
65 "the list in legacy_fonts.h");
76 bool PangoFontInfo::fontconfig_initialized_ =
false;
83 : desc_(
NULL), resolution_(kDefaultResolution) {
85 tprintf(
"ERROR: Could not parse %s\n", desc.c_str());
90 void PangoFontInfo::Clear() {
94 is_smallcaps_ =
false;
95 is_monospace_ =
false;
99 pango_font_description_free(desc_);
105 if (!desc_)
return "";
106 char* desc_str = pango_font_description_to_string(desc_);
107 string desc_name(desc_str);
118 if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
119 fontconfig_initialized_ =
true;
122 if (FLAGS_fontconfig_refresh_cache || force_clear) {
124 FLAGS_fontconfig_tmpdir.c_str(),
"*cache-?").c_str());
126 if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
128 const int MAX_FONTCONF_FILESIZE = 1024;
129 char fonts_conf_template[MAX_FONTCONF_FILESIZE];
130 snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
131 "<?xml version=\"1.0\"?>\n"
132 "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
135 "<cachedir>%s</cachedir>\n"
136 "<config></config>\n"
137 "</fontconfig>", fonts_dir.c_str(),
138 FLAGS_fontconfig_tmpdir.c_str());
139 string fonts_conf_file =
File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
144 std::string env(
"FONTCONFIG_PATH=");
145 env.append(FLAGS_fontconfig_tmpdir.c_str());
147 putenv(
"LANG=en_US.utf8");
149 setenv(
"FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(),
true);
151 setenv(
"LANG",
"en_US.utf8",
true);
153 if (!fontconfig_initialized_ || force_clear) {
154 if (FcInitReinitialize() != FcTrue) {
155 tprintf(
"FcInitiReinitialize failed!!\n");
158 fontconfig_initialized_ =
true;
162 static void ListFontFamilies(PangoFontFamily*** families,
165 PangoFontMap* font_map = pango_cairo_font_map_get_default();
167 pango_font_map_list_families(font_map, families, n_families);
172 static bool IsMonospaceFontFamily(
const char* family_name) {
173 PangoFontFamily** families = 0;
175 bool is_monospace =
false;
176 ListFontFamilies(&families, &n_families);
179 for (
int i = 0; i < n_families; ++i) {
180 if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
181 is_monospace = pango_font_family_is_monospace(families[i]);
187 tlog(1,
"Could not find monospace property of family %s\n", family_name);
193 bool PangoFontInfo::ParseFontDescription(
const PangoFontDescription *desc) {
195 const char* family = pango_font_description_get_family(desc);
197 char* desc_str = pango_font_description_to_string(desc);
198 tprintf(
"WARNING: Could not parse family name from description: '%s'\n",
203 family_name_ = string(family);
204 desc_ = pango_font_description_copy(desc);
205 is_monospace_ = IsMonospaceFontFamily(family);
208 font_size_ = pango_font_description_get_size(desc);
209 if (!pango_font_description_get_size_is_absolute(desc)) {
210 font_size_ /= PANGO_SCALE;
213 PangoStyle style = pango_font_description_get_style(desc);
214 is_italic_ = (PANGO_STYLE_ITALIC == style ||
215 PANGO_STYLE_OBLIQUE == style);
216 is_smallcaps_ = (pango_font_description_get_variant(desc)
217 == PANGO_VARIANT_SMALL_CAPS);
219 is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
228 PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
229 bool success = ParseFontDescription(desc);
230 pango_font_description_free(desc);
237 PangoFont* PangoFontInfo::ToPangoFont()
const {
239 PangoFontMap* font_map = pango_cairo_font_map_get_default();
240 PangoContext* context = pango_context_new();
241 pango_cairo_context_set_resolution(context, resolution_);
242 pango_context_set_font_map(context, font_map);
243 PangoFont* font =
NULL;
246 font = pango_font_map_load_font(font_map, context, desc_);
248 g_object_unref(context);
253 PangoFont* font = ToPangoFont();
254 PangoCoverage* coverage = pango_font_get_coverage(font,
NULL);
260 if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
262 int len = it.get_utf8(tmp);
264 tlog(2,
"'%s' (U+%x) not covered by font\n", tmp, *it);
273 static char* my_strnmove(
char* dest,
const char* src,
size_t n) {
282 }
while (n && src[0]);
294 PangoFont* font = ToPangoFont();
295 PangoCoverage* coverage = pango_font_get_coverage(font,
NULL);
296 int num_dropped_chars = 0;
300 char* out =
const_cast<char*
>(utf8_text->c_str());
307 if (!it.is_legal()) {
313 const char* utf8_char = it.utf8_data();
316 if (!
IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
317 pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
321 tlog(2,
"'%s' (U+%x) not covered by font\n", str, unicode);
327 my_strnmove(out, utf8_char, utf8_len);
330 utf8_text->resize(out - utf8_text->c_str());
331 return num_dropped_chars;
335 int* x_bearing,
int* x_advance)
const {
337 PangoFont* font = ToPangoFont();
339 int total_advance = 0;
349 PangoGlyph glyph_index = pango_fc_font_get_glyph(
350 reinterpret_cast<PangoFcFont*>(font), *it);
356 PangoRectangle ink_rect, logical_rect;
357 pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
358 pango_extents_to_pixels(&ink_rect,
NULL);
359 pango_extents_to_pixels(&logical_rect,
NULL);
361 int bearing = total_advance + PANGO_LBEARING(ink_rect);
362 if (it == it_begin || bearing < min_bearing) {
363 min_bearing = bearing;
365 total_advance += PANGO_RBEARING(logical_rect);
367 *x_bearing = min_bearing;
368 *x_advance = total_advance;
373 vector<string> graphemes;
378 vector<string>* graphemes)
const {
379 if (graphemes) graphemes->clear();
388 const char32 kDottedCircleGlyph = 9676;
389 bool bad_glyph =
false;
390 PangoFontMap* font_map = pango_cairo_font_map_get_default();
391 PangoContext* context = pango_context_new();
392 pango_context_set_font_map(context, font_map);
397 layout = pango_layout_new(context);
400 pango_layout_set_font_description(layout, desc_);
402 PangoFontDescription *desc = pango_font_description_from_string(
404 pango_layout_set_font_description(layout, desc);
405 pango_font_description_free(desc);
407 pango_layout_set_text(layout, utf8_word, len);
408 PangoLayoutIter* run_iter =
NULL;
411 run_iter = pango_layout_get_iter(layout);
414 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
416 tlog(2,
"Found end of line NULL run marker\n");
419 PangoGlyph dotted_circle_glyph;
420 PangoFont* font = run->item->analysis.font;
421 dotted_circle_glyph = pango_fc_font_get_glyph(
422 reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
424 PangoFontDescription* desc = pango_font_describe(font);
425 char* desc_str = pango_font_description_to_string(desc);
426 tlog(2,
"Desc of font in run: %s\n", desc_str);
428 pango_font_description_free(desc);
431 PangoGlyphItemIter cluster_iter;
432 gboolean have_cluster;
433 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
435 have_cluster && !bad_glyph;
436 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
437 const int start_byte_index = cluster_iter.start_index;
438 const int end_byte_index = cluster_iter.end_index;
439 int start_glyph_index = cluster_iter.start_glyph;
440 int end_glyph_index = cluster_iter.end_glyph;
441 string cluster_text = string(utf8_word + start_byte_index,
442 end_byte_index - start_byte_index);
443 if (graphemes) graphemes->push_back(cluster_text);
445 tlog(2,
"Skipping whitespace\n");
449 printf(
"start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
450 start_byte_index, end_byte_index,
451 start_glyph_index, end_glyph_index);
453 for (
int i = start_glyph_index,
454 step = (end_glyph_index > start_glyph_index) ? 1 : -1;
455 !bad_glyph && i != end_glyph_index; i+= step) {
456 const bool unknown_glyph =
457 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
458 PANGO_GLYPH_UNKNOWN_FLAG);
459 const bool illegal_glyph =
460 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
461 dotted_circle_glyph);
462 bad_glyph = unknown_glyph || illegal_glyph;
464 printf(
"(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
469 printf(
" '%s'\n", cluster_text.c_str());
472 tlog(1,
"Found illegal glyph!\n");
474 }
while (!bad_glyph && pango_layout_iter_next_run(run_iter));
476 pango_layout_iter_free(run_iter);
477 g_object_unref(context);
478 g_object_unref(layout);
479 if (bad_glyph && graphemes) graphemes->clear();
485 vector<string> FontUtils::available_fonts_;
498 string* best_match) {
499 string query_desc(input_query_desc);
500 if (PANGO_VERSION <= 12005) {
502 query_desc.erase(std::remove(query_desc.begin(), query_desc.end(),
','),
504 const string kMediumStr =
" Medium";
505 std::size_t found = query_desc.find(kMediumStr);
506 if (found != std::string::npos) {
507 query_desc.erase(found, kMediumStr.length());
511 PangoFontDescription *desc = pango_font_description_from_string(
513 PangoFont* selected_font =
NULL;
516 PangoFontMap* font_map = pango_cairo_font_map_get_default();
517 PangoContext* context = pango_context_new();
518 pango_context_set_font_map(context, font_map);
521 selected_font = pango_font_map_load_font(font_map, context, desc);
523 g_object_unref(context);
525 if (selected_font ==
NULL) {
526 pango_font_description_free(desc);
529 PangoFontDescription* selected_desc = pango_font_describe(selected_font);
531 bool equal = pango_font_description_equal(desc, selected_desc);
532 tlog(3,
"query weight = %d \t selected weight =%d\n",
533 pango_font_description_get_weight(desc),
534 pango_font_description_get_weight(selected_desc));
536 char* selected_desc_str = pango_font_description_to_string(selected_desc);
537 tlog(2,
"query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
539 if (!equal && best_match !=
NULL) {
540 *best_match = selected_desc_str;
543 int len = best_match->size();
544 if (len > 2 && best_match->at(len - 1) ==
'0' &&
545 best_match->at(len - 2) ==
' ') {
546 *best_match = best_match->substr(0, len - 2);
549 g_free(selected_desc_str);
550 pango_font_description_free(selected_desc);
551 g_object_unref(selected_font);
552 pango_font_description_free(desc);
556 static bool ShouldIgnoreFontFamilyName(
const char* query) {
557 static const char* kIgnoredFamilyNames[]
558 = {
"Sans",
"Serif",
"Monospace",
NULL };
559 const char** list = kIgnoredFamilyNames;
560 for (; *list !=
NULL; ++list) {
561 if (!strcmp(*list, query))
570 if (available_fonts_.size()) {
571 return available_fonts_;
573 #ifndef USE_STD_NAMESPACE
574 if (FLAGS_use_only_legacy_fonts) {
576 tprintf(
"Using list of legacy fonts only\n");
577 const int kNumFontLists = 4;
578 for (
int i = 0; i < kNumFontLists; ++i) {
579 for (
int j = 0; kFontlists[i][j] !=
NULL; ++j) {
580 available_fonts_.push_back(kFontlists[i][j]);
583 return available_fonts_;
587 PangoFontFamily** families = 0;
589 ListFontFamilies(&families, &n_families);
590 for (
int i = 0; i < n_families; ++i) {
591 const char* family_name = pango_font_family_get_name(families[i]);
592 tlog(2,
"Listing family %s\n", family_name);
593 if (ShouldIgnoreFontFamilyName(family_name)) {
598 PangoFontFace** faces =
NULL;
599 pango_font_family_list_faces(families[i], &faces, &n_faces);
600 for (
int j = 0; j < n_faces; ++j) {
601 PangoFontDescription* desc = pango_font_face_describe(faces[j]);
602 char* desc_str = pango_font_description_to_string(desc);
604 available_fonts_.push_back(desc_str);
606 pango_font_description_free(desc);
612 sort(available_fonts_.begin(), available_fonts_.end());
613 return available_fonts_;
617 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
618 vector<bool>* unichar_bitmap) {
619 const int kMinUnicodeValue = 33;
620 const int kMaxUnicodeValue = 0x10FFFF;
621 unichar_bitmap->resize(kMaxUnicodeValue + 1,
false);
623 for (
int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
626 = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
639 vector<bool>* unichar_bitmap) {
641 PangoCoverage* coverage = pango_font_get_coverage(
642 font_info.ToPangoFont(),
NULL);
643 CharCoverageMapToBitmap(coverage, unichar_bitmap);
648 vector<bool>* unichar_bitmap) {
650 PangoCoverage* all_coverage = pango_coverage_new();
651 tlog(1,
"Processing %d fonts\n", fonts.size());
652 for (
int i = 0; i < fonts.size(); ++i) {
654 PangoCoverage* coverage = pango_font_get_coverage(
655 font_info.ToPangoFont(),
NULL);
657 pango_coverage_max(all_coverage, coverage);
659 CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
660 pango_coverage_unref(all_coverage);
668 const string& fontname,
670 vector<bool>* ch_flags) {
673 tprintf(
"ERROR: Could not parse %s\n", fontname.c_str());
675 PangoFont* font = font_info.ToPangoFont();
676 PangoCoverage* coverage = pango_font_get_coverage(font,
NULL);
680 ch_flags->reserve(ch_map.size());
684 for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
685 it != ch_map.end(); ++it) {
687 (pango_coverage_get(coverage, it->first)
688 == PANGO_COVERAGE_EXACT));
691 ok_chars += it->second;
694 ch_flags->push_back(covered);
703 vector<pair<
const char*, vector<bool> > >* fonts) {
704 const double kMinOKFraction = 0.99;
707 const double kMinWeightedFraction = 0.99995;
710 vector<vector<bool> > font_flags;
711 vector<int> font_scores;
712 vector<int> raw_scores;
713 int most_ok_chars = 0;
714 int best_raw_score = 0;
716 for (
int i = 0; i < font_names.size(); ++i) {
717 vector<bool> ch_flags;
719 int ok_chars =
FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
720 most_ok_chars =
MAX(ok_chars, most_ok_chars);
721 best_raw_score =
MAX(raw_score, best_raw_score);
723 font_flags.push_back(ch_flags);
724 font_scores.push_back(ok_chars);
725 raw_scores.push_back(raw_score);
736 int least_good_enough =
static_cast<int>(most_ok_chars * kMinOKFraction);
737 int least_raw_enough =
static_cast<int>(best_raw_score * kMinOKFraction);
738 int override_enough =
static_cast<int>(most_ok_chars * kMinWeightedFraction);
741 for (
int i = 0; i < font_names.size(); ++i) {
742 int score = font_scores[i];
743 int raw_score = raw_scores[i];
744 if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
745 score >= override_enough) {
746 fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
747 tlog(1,
"OK font %s = %.4f%%, raw = %d = %.2f%%\n",
748 font_names[i].c_str(),
749 100.0 * score / most_ok_chars,
750 raw_score, 100.0 * raw_score / best_raw_score);
751 font_list += font_names[i];
753 }
else if (score >= least_good_enough || raw_score >= least_raw_enough) {
754 tlog(1,
"Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
755 font_names[i].c_str(),
756 100.0 * score / most_ok_chars,
757 raw_score, 100.0 * raw_score / best_raw_score);
765 string* font_name, vector<string>* graphemes) {
772 const vector<string>& all_fonts,
773 string* font_name, vector<string>* graphemes) {
774 if (font_name) font_name->clear();
775 if (graphemes) graphemes->clear();
776 for (
int i = 0; i < all_fonts.size(); ++i) {
778 vector<string> found_graphemes;
780 "Could not parse font desc name %s\n",
781 all_fonts[i].c_str());
783 if (graphemes) graphemes->swap(found_graphemes);
784 if (font_name) *font_name = all_fonts[i];
static string BestFonts(const unordered_map< char32, inT64 > &ch_map, vector< std::pair< const char *, vector< bool > > > *font_flag)
#define DISABLE_HEAP_LEAK_CHECK
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool IsUTF8Whitespace(const char *text)
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, vector< string > *graphemes)
static string JoinPath(const string &prefix, const string &suffix)
static bool IsAvailableFont(const char *font_desc)
static void WriteStringToFileOrDie(const string &str, const string &filename)
int DropUncoveredChars(string *utf8_text) const
#define ASSERT_HOST_MSG(x, msg...)
static bool DeleteMatchingFiles(const char *pattern)
const int kDefaultResolution
Default resolution used if input is not believable.
string DescriptionName() const
#define TLOG_IS_ON(level)
BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,"Does a one-time deletion of cache files from the ""fontconfig_tmpdir before initializing fontconfig.")
STRING_PARAM_FLAG(fonts_dir,"/auto/ocr-data/tesstraining/fonts","Overrides system default font location")
static const_iterator begin(const char *utf8_str, const int byte_length)
bool ParseFontDescriptionName(const string &name)
static void InitFontConfig(bool force_clear, const string &fonts_dir)
static void GetAllRenderableCharacters(vector< bool > *unichar_bitmap)
static int FontScore(const unordered_map< char32, inT64 > &ch_map, const string &fontname, int *raw_score, vector< bool > *ch_flags)
bool CanRenderString(const char *utf8_word, int len, vector< string > *graphemes) const
bool IsInterchangeValid(const char32 ch)
static const vector< string > & ListAvailableFonts()
static const_iterator end(const char *utf8_str, const int byte_length)
char * strcasestr(const char *haystack, const char *needle)
Locate a substring in a string, ignoring case.
bool IsWhitespace(const char32 ch)