All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef MINGW
26 // workaround for stdlib.h and putenv
27 #undef __STRICT_ANSI__
28 #include "strcasestr.h"
29 #endif // MINGW
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <sys/param.h>
34 #include <algorithm>
35 
36 #include "pango_font_info.h"
37 #include "commandlineflags.h"
38 #include "fileio.h"
39 #include "normstrngs.h"
40 #include "tlog.h"
41 #include "unichar.h"
42 #include "util.h"
43 #include "pango/pango.h"
44 #include "pango/pangocairo.h"
45 #include "pango/pangofc-font.h"
46 
47 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
48  "Overrides system default font location");
49 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
50  "Overrides fontconfig default temporary dir");
51 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
52  "Does a one-time deletion of cache files from the "
53  "fontconfig_tmpdir before initializing fontconfig.");
54 BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
55  "Does a one-time reset of the fontconfig config file to point"
56  " to fonts_dir before initializing fontconfig. Set to true"
57  " if fontconfig_refresh_cache is true. Set it to false to use"
58  " multiple instances in separate processes without having to"
59  " rescan the fonts_dir, using a previously setup font cache");
60 
61 #ifndef USE_STD_NAMESPACE
62 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
63 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
64  "Overrides --fonts_dir and sets the known universe of fonts to"
65  "the list in legacy_fonts.h");
66 #else
67 using std::pair;
68 #endif
69 
70 namespace tesseract {
71 
72 // Default assumed output resolution. Required only for providing font metrics
73 // in pixels.
74 const int kDefaultResolution = 300;
75 
76 bool PangoFontInfo::fontconfig_initialized_ = false;
77 
78 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
79  Clear();
80 }
81 
82 PangoFontInfo::PangoFontInfo(const string& desc)
83  : desc_(NULL), resolution_(kDefaultResolution) {
84  if (!ParseFontDescriptionName(desc)) {
85  tprintf("ERROR: Could not parse %s\n", desc.c_str());
86  Clear();
87  }
88 }
89 
90 void PangoFontInfo::Clear() {
91  font_size_ = 0;
92  is_bold_ = false;
93  is_italic_ = false;
94  is_smallcaps_ = false;
95  is_monospace_ = false;
96  family_name_.clear();
97  font_type_ = UNKNOWN;
98  if (desc_) {
99  pango_font_description_free(desc_);
100  desc_ = NULL;
101  }
102 }
103 
105  if (!desc_) return "";
106  char* desc_str = pango_font_description_to_string(desc_);
107  string desc_name(desc_str);
108  g_free(desc_str);
109  return desc_name;
110 }
111 
112 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
113 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied
114 // fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
115 // to point to this fonts.conf file. If force_clear, the cache is refreshed
116 // even if it has already been initialized.
117 void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
118  if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
119  fontconfig_initialized_ = true;
120  return;
121  }
122  if (FLAGS_fontconfig_refresh_cache || force_clear) {
124  FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
125  }
126  if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
127  force_clear) {
128  const int MAX_FONTCONF_FILESIZE = 1024;
129  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
130  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
131  "<?xml version=\"1.0\"?>\n"
132  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
133  "<fontconfig>\n"
134  "<dir>%s</dir>\n"
135  "<cachedir>%s</cachedir>\n"
136  "<config></config>\n"
137  "</fontconfig>", fonts_dir.c_str(),
138  FLAGS_fontconfig_tmpdir.c_str());
139  string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
140  "fonts.conf");
141  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
142  }
143 #ifdef _WIN32
144  std::string env("FONTCONFIG_PATH=");
145  env.append(FLAGS_fontconfig_tmpdir.c_str());
146  putenv(env.c_str());
147  putenv("LANG=en_US.utf8");
148 #else
149  setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true);
150  // Fix the locale so that the reported font names are consistent.
151  setenv("LANG", "en_US.utf8", true);
152 #endif // _WIN32
153  if (!fontconfig_initialized_ || force_clear) {
154  if (FcInitReinitialize() != FcTrue) {
155  tprintf("FcInitiReinitialize failed!!\n");
156  }
157  }
158  fontconfig_initialized_ = true;
160 }
161 
162 static void ListFontFamilies(PangoFontFamily*** families,
163  int* n_families) {
164  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
165  PangoFontMap* font_map = pango_cairo_font_map_get_default();
167  pango_font_map_list_families(font_map, families, n_families);
168 }
169 
170 // Inspects whether a given font family is monospace. If the font is not
171 // available, it cannot make a decision and returns false by default.
172 static bool IsMonospaceFontFamily(const char* family_name) {
173  PangoFontFamily** families = 0;
174  int n_families = 0;
175  bool is_monospace = false;
176  ListFontFamilies(&families, &n_families);
177  ASSERT_HOST(n_families > 0);
178  bool found = false;
179  for (int i = 0; i < n_families; ++i) {
180  if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
181  is_monospace = pango_font_family_is_monospace(families[i]);
182  found = true;
183  break;
184  }
185  }
186  if (!found) {
187  tlog(1, "Could not find monospace property of family %s\n", family_name);
188  }
189  g_free(families);
190  return is_monospace;
191 }
192 
193 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
194  Clear();
195  const char* family = pango_font_description_get_family(desc);
196  if (!family) {
197  char* desc_str = pango_font_description_to_string(desc);
198  tprintf("WARNING: Could not parse family name from description: '%s'\n",
199  desc_str);
200  g_free(desc_str);
201  return false;
202  }
203  family_name_ = string(family);
204  desc_ = pango_font_description_copy(desc);
205  is_monospace_ = IsMonospaceFontFamily(family);
206 
207  // Set font size in points
208  font_size_ = pango_font_description_get_size(desc);
209  if (!pango_font_description_get_size_is_absolute(desc)) {
210  font_size_ /= PANGO_SCALE;
211  }
212 
213  PangoStyle style = pango_font_description_get_style(desc);
214  is_italic_ = (PANGO_STYLE_ITALIC == style ||
215  PANGO_STYLE_OBLIQUE == style);
216  is_smallcaps_ = (pango_font_description_get_variant(desc)
217  == PANGO_VARIANT_SMALL_CAPS);
218 
219  is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
220  // We dont have a way to detect whether a font is of type Fraktur. The fonts
221  // we currently use all have "Fraktur" in their family name, so we do a
222  // fragile but functional check for that here.
223  is_fraktur_ = (strcasestr(family, "Fraktur") != NULL);
224  return true;
225 }
226 
228  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
229  bool success = ParseFontDescription(desc);
230  pango_font_description_free(desc);
231  return success;
232 }
233 
234 // Returns the PangoFont structure corresponding to the closest available font
235 // in the font map. Note that if the font is wholly missing, this could
236 // correspond to a completely different font family and face.
237 PangoFont* PangoFontInfo::ToPangoFont() const {
238  InitFontConfig(false, FLAGS_fonts_dir.c_str());
239  PangoFontMap* font_map = pango_cairo_font_map_get_default();
240  PangoContext* context = pango_context_new();
241  pango_cairo_context_set_resolution(context, resolution_);
242  pango_context_set_font_map(context, font_map);
243  PangoFont* font = NULL;
244  {
246  font = pango_font_map_load_font(font_map, context, desc_);
247  }
248  g_object_unref(context);
249  return font;
250 }
251 
252 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
253  PangoFont* font = ToPangoFont();
254  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
255  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
256  it != UNICHAR::end(utf8_text, byte_length);
257  ++it) {
258  if (IsWhitespace(*it) || pango_is_zero_width(*it))
259  continue;
260  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
261  char tmp[5];
262  int len = it.get_utf8(tmp);
263  tmp[len] = '\0';
264  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
265  return false;
266  }
267  }
268  return true;
269 }
270 
// This variant of strncpy permits src and dest to overlap. It copies the
// first byte first, so it is safe when dest does not lie after src.
// Copies at most n bytes from src, stopping at a nul byte, and fills whatever
// remains of the n bytes in dest with nul bytes. A count of zero leaves dest
// untouched. Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;

  // Copy characters until n reaches zero or the next src byte is a nul. The
  // byte is tested before it is copied, so src is never read past its
  // terminator and n can never wrap around.
  while (n && *src) {
    *dest = *src;
    --n;
    ++dest;
    ++src;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n) {
    *dest = '\0';
    --n;
    ++dest;
  }
  return ret;
}
292 
293 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
294  PangoFont* font = ToPangoFont();
295  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
296  int num_dropped_chars = 0;
297  // Maintain two iterators that point into the string. For space efficiency, we
298  // will repeatedly copy one covered UTF8 character from one to the other, and
299  // at the end resize the string to the right length.
300  char* out = const_cast<char*>(utf8_text->c_str());
301  const UNICHAR::const_iterator it_begin =
302  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
303  const UNICHAR::const_iterator it_end =
304  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
305  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
306  // Skip bad utf-8.
307  if (!it.is_legal()) {
308  ++it; // One suitable error message will still be issued.
309  continue;
310  }
311  int unicode = *it;
312  int utf8_len = it.utf8_len();
313  const char* utf8_char = it.utf8_data();
314  // Move it forward before the data gets modified.
315  ++it;
316  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
317  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
318  if (TLOG_IS_ON(2)) {
319  UNICHAR unichar(unicode);
320  char* str = unichar.utf8_str();
321  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
322  delete[] str;
323  }
324  ++num_dropped_chars;
325  continue;
326  }
327  my_strnmove(out, utf8_char, utf8_len);
328  out += utf8_len;
329  }
330  utf8_text->resize(out - utf8_text->c_str());
331  return num_dropped_chars;
332 }
333 
334 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
335  int* x_bearing, int* x_advance) const {
336  // Convert to equivalent PangoFont structure
337  PangoFont* font = ToPangoFont();
338  // Find the glyph index in the font for the supplied utf8 character.
339  int total_advance = 0;
340  int min_bearing = 0;
341  // Handle multi-unicode strings by reporting the left-most position of the
342  // x-bearing, and right-most position of the x-advance if the string were to
343  // be rendered.
344  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
345  utf8_char.length());
346  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
347  utf8_char.length());
348  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
349  PangoGlyph glyph_index = pango_fc_font_get_glyph(
350  reinterpret_cast<PangoFcFont*>(font), *it);
351  if (!glyph_index) {
352  // Glyph for given unicode character doesn't exist in font.
353  return false;
354  }
355  // Find the ink glyph extents for the glyph
356  PangoRectangle ink_rect, logical_rect;
357  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
358  pango_extents_to_pixels(&ink_rect, NULL);
359  pango_extents_to_pixels(&logical_rect, NULL);
360 
361  int bearing = total_advance + PANGO_LBEARING(ink_rect);
362  if (it == it_begin || bearing < min_bearing) {
363  min_bearing = bearing;
364  }
365  total_advance += PANGO_RBEARING(logical_rect);
366  }
367  *x_bearing = min_bearing;
368  *x_advance = total_advance;
369  return true;
370 }
371 
372 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
373  vector<string> graphemes;
374  return CanRenderString(utf8_word, len, &graphemes);
375 }
376 
377 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
378  vector<string>* graphemes) const {
379  if (graphemes) graphemes->clear();
380  // We check for font coverage of the text first, as otherwise Pango could
381  // (undesirably) fall back to another font that does have the required
382  // coverage.
383  if (!CoversUTF8Text(utf8_word, len)) {
384  return false;
385  }
386  // U+25CC dotted circle character that often (but not always) gets rendered
387  // when there is an illegal grapheme sequence.
388  const char32 kDottedCircleGlyph = 9676;
389  bool bad_glyph = false;
390  PangoFontMap* font_map = pango_cairo_font_map_get_default();
391  PangoContext* context = pango_context_new();
392  pango_context_set_font_map(context, font_map);
393  PangoLayout* layout;
394  {
395  // Pango is not relasing the cached layout.
397  layout = pango_layout_new(context);
398  }
399  if (desc_) {
400  pango_layout_set_font_description(layout, desc_);
401  } else {
402  PangoFontDescription *desc = pango_font_description_from_string(
403  DescriptionName().c_str());
404  pango_layout_set_font_description(layout, desc);
405  pango_font_description_free(desc);
406  }
407  pango_layout_set_text(layout, utf8_word, len);
408  PangoLayoutIter* run_iter = NULL;
409  { // Fontconfig caches some information here that is not freed before exit.
411  run_iter = pango_layout_get_iter(layout);
412  }
413  do {
414  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
415  if (!run) {
416  tlog(2, "Found end of line NULL run marker\n");
417  continue;
418  }
419  PangoGlyph dotted_circle_glyph;
420  PangoFont* font = run->item->analysis.font;
421  dotted_circle_glyph = pango_fc_font_get_glyph(
422  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
423  if (TLOG_IS_ON(2)) {
424  PangoFontDescription* desc = pango_font_describe(font);
425  char* desc_str = pango_font_description_to_string(desc);
426  tlog(2, "Desc of font in run: %s\n", desc_str);
427  g_free(desc_str);
428  pango_font_description_free(desc);
429  }
430 
431  PangoGlyphItemIter cluster_iter;
432  gboolean have_cluster;
433  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
434  run, utf8_word);
435  have_cluster && !bad_glyph;
436  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
437  const int start_byte_index = cluster_iter.start_index;
438  const int end_byte_index = cluster_iter.end_index;
439  int start_glyph_index = cluster_iter.start_glyph;
440  int end_glyph_index = cluster_iter.end_glyph;
441  string cluster_text = string(utf8_word + start_byte_index,
442  end_byte_index - start_byte_index);
443  if (graphemes) graphemes->push_back(cluster_text);
444  if (IsUTF8Whitespace(cluster_text.c_str())) {
445  tlog(2, "Skipping whitespace\n");
446  continue;
447  }
448  if (TLOG_IS_ON(2)) {
449  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
450  start_byte_index, end_byte_index,
451  start_glyph_index, end_glyph_index);
452  }
453  for (int i = start_glyph_index,
454  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
455  !bad_glyph && i != end_glyph_index; i+= step) {
456  const bool unknown_glyph =
457  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
458  PANGO_GLYPH_UNKNOWN_FLAG);
459  const bool illegal_glyph =
460  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
461  dotted_circle_glyph);
462  bad_glyph = unknown_glyph || illegal_glyph;
463  if (TLOG_IS_ON(2)) {
464  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
465  bad_glyph ? 1 : 0);
466  }
467  }
468  if (TLOG_IS_ON(2)) {
469  printf(" '%s'\n", cluster_text.c_str());
470  }
471  if (bad_glyph)
472  tlog(1, "Found illegal glyph!\n");
473  }
474  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
475 
476  pango_layout_iter_free(run_iter);
477  g_object_unref(context);
478  g_object_unref(layout);
479  if (bad_glyph && graphemes) graphemes->clear();
480  return !bad_glyph;
481 }
482 
483 
484 // ------------------------ FontUtils ------------------------------------
485 vector<string> FontUtils::available_fonts_; // cache list
486 
487 // Returns whether the specified font description is available in the fonts
488 // directory.
489 //
490 // The generated list of font families and faces includes "synthesized" font
491 // faces that are not truly loadable. Pango versions >=1.18 have a
492 // pango_font_face_is_synthesized method that can be used to prune the list.
493 // Until then, we are restricted to using a hack where we try to load the font
494 // from the font_map, and then check what we loaded to see if it has the
495 // description we expected. If it is not, then the font is deemed unavailable.
496 /* static */
497 bool FontUtils::IsAvailableFont(const char* input_query_desc,
498  string* best_match) {
499  string query_desc(input_query_desc);
500  if (PANGO_VERSION <= 12005) {
501  // Strip commas and any ' Medium' substring in the name.
502  query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
503  query_desc.end());
504  const string kMediumStr = " Medium";
505  std::size_t found = query_desc.find(kMediumStr);
506  if (found != std::string::npos) {
507  query_desc.erase(found, kMediumStr.length());
508  }
509  }
510 
511  PangoFontDescription *desc = pango_font_description_from_string(
512  query_desc.c_str());
513  PangoFont* selected_font = NULL;
514  {
515  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
516  PangoFontMap* font_map = pango_cairo_font_map_get_default();
517  PangoContext* context = pango_context_new();
518  pango_context_set_font_map(context, font_map);
519  {
521  selected_font = pango_font_map_load_font(font_map, context, desc);
522  }
523  g_object_unref(context);
524  }
525  if (selected_font == NULL) {
526  pango_font_description_free(desc);
527  return false;
528  }
529  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
530 
531  bool equal = pango_font_description_equal(desc, selected_desc);
532  tlog(3, "query weight = %d \t selected weight =%d\n",
533  pango_font_description_get_weight(desc),
534  pango_font_description_get_weight(selected_desc));
535 
536  char* selected_desc_str = pango_font_description_to_string(selected_desc);
537  tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
538  selected_desc_str);
539  if (!equal && best_match != NULL) {
540  *best_match = selected_desc_str;
541  // Clip the ending ' 0' if there is one. It seems that, if there is no
542  // point size on the end of the fontname, then Pango always appends ' 0'.
543  int len = best_match->size();
544  if (len > 2 && best_match->at(len - 1) == '0' &&
545  best_match->at(len - 2) == ' ') {
546  *best_match = best_match->substr(0, len - 2);
547  }
548  }
549  g_free(selected_desc_str);
550  pango_font_description_free(selected_desc);
551  g_object_unref(selected_font);
552  pango_font_description_free(desc);
553  return equal;
554 }
555 
// Returns true when `query` is exactly (case-sensitively) one of the family
// names "Sans", "Serif" or "Monospace".
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kIgnoredFamilyNames[] = {
    "Sans", "Serif", "Monospace"
  };
  const size_t kNumIgnored =
      sizeof(kIgnoredFamilyNames) / sizeof(kIgnoredFamilyNames[0]);
  for (size_t idx = 0; idx < kNumIgnored; ++idx) {
    if (strcmp(kIgnoredFamilyNames[idx], query) == 0) {
      return true;
    }
  }
  return false;
}
566 
567 // Outputs description names of available fonts.
568 /* static */
569 const vector<string>& FontUtils::ListAvailableFonts() {
570  if (available_fonts_.size()) {
571  return available_fonts_;
572  }
573 #ifndef USE_STD_NAMESPACE
574  if (FLAGS_use_only_legacy_fonts) {
575  // Restrict view to list of fonts in legacy_fonts.h
576  tprintf("Using list of legacy fonts only\n");
577  const int kNumFontLists = 4;
578  for (int i = 0; i < kNumFontLists; ++i) {
579  for (int j = 0; kFontlists[i][j] != NULL; ++j) {
580  available_fonts_.push_back(kFontlists[i][j]);
581  }
582  }
583  return available_fonts_;
584  }
585 #endif
586 
587  PangoFontFamily** families = 0;
588  int n_families = 0;
589  ListFontFamilies(&families, &n_families);
590  for (int i = 0; i < n_families; ++i) {
591  const char* family_name = pango_font_family_get_name(families[i]);
592  tlog(2, "Listing family %s\n", family_name);
593  if (ShouldIgnoreFontFamilyName(family_name)) {
594  continue;
595  }
596 
597  int n_faces;
598  PangoFontFace** faces = NULL;
599  pango_font_family_list_faces(families[i], &faces, &n_faces);
600  for (int j = 0; j < n_faces; ++j) {
601  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
602  char* desc_str = pango_font_description_to_string(desc);
603  if (IsAvailableFont(desc_str)) {
604  available_fonts_.push_back(desc_str);
605  }
606  pango_font_description_free(desc);
607  g_free(desc_str);
608  }
609  g_free(faces);
610  }
611  g_free(families);
612  sort(available_fonts_.begin(), available_fonts_.end());
613  return available_fonts_;
614 }
615 
616 
617 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
618  vector<bool>* unichar_bitmap) {
619  const int kMinUnicodeValue = 33;
620  const int kMaxUnicodeValue = 0x10FFFF;
621  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
622  // Mark off characters that the font can render.
623  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
624  if (IsInterchangeValid(i)) {
625  (*unichar_bitmap)[i]
626  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
627  }
628  }
629 }
630 
631 /* static */
632 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
633  const vector<string>& all_fonts = ListAvailableFonts();
634  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
635 }
636 
637 /* static */
638 void FontUtils::GetAllRenderableCharacters(const string& font_name,
639  vector<bool>* unichar_bitmap) {
640  PangoFontInfo font_info(font_name);
641  PangoCoverage* coverage = pango_font_get_coverage(
642  font_info.ToPangoFont(), NULL);
643  CharCoverageMapToBitmap(coverage, unichar_bitmap);
644 }
645 
646 /* static */
647 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
648  vector<bool>* unichar_bitmap) {
649  // Form the union of coverage maps from the fonts
650  PangoCoverage* all_coverage = pango_coverage_new();
651  tlog(1, "Processing %d fonts\n", fonts.size());
652  for (int i = 0; i < fonts.size(); ++i) {
653  PangoFontInfo font_info(fonts[i]);
654  PangoCoverage* coverage = pango_font_get_coverage(
655  font_info.ToPangoFont(), NULL);
656  // Mark off characters that any font can render.
657  pango_coverage_max(all_coverage, coverage);
658  }
659  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
660  pango_coverage_unref(all_coverage);
661 }
662 
663 
664 // Utilities written to be backward compatible with StringRender
665 
666 /* static */
667 int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map,
668  const string& fontname,
669  int* raw_score,
670  vector<bool>* ch_flags) {
671  PangoFontInfo font_info;
672  if (!font_info.ParseFontDescriptionName(fontname)) {
673  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
674  }
675  PangoFont* font = font_info.ToPangoFont();
676  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
677 
678  if (ch_flags) {
679  ch_flags->clear();
680  ch_flags->reserve(ch_map.size());
681  }
682  *raw_score = 0;
683  int ok_chars = 0;
684  for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
685  it != ch_map.end(); ++it) {
686  bool covered = (IsWhitespace(it->first) ||
687  (pango_coverage_get(coverage, it->first)
688  == PANGO_COVERAGE_EXACT));
689  if (covered) {
690  ++(*raw_score);
691  ok_chars += it->second;
692  }
693  if (ch_flags) {
694  ch_flags->push_back(covered);
695  }
696  }
697  return ok_chars;
698 }
699 
700 
701 /* static */
702 string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map,
703  vector<pair<const char*, vector<bool> > >* fonts) {
704  const double kMinOKFraction = 0.99;
705  // Weighted fraction of characters that must be renderable in a font to make
706  // it OK even if the raw count is not good.
707  const double kMinWeightedFraction = 0.99995;
708 
709  fonts->clear();
710  vector<vector<bool> > font_flags;
711  vector<int> font_scores;
712  vector<int> raw_scores;
713  int most_ok_chars = 0;
714  int best_raw_score = 0;
715  const vector<string>& font_names = FontUtils::ListAvailableFonts();
716  for (int i = 0; i < font_names.size(); ++i) {
717  vector<bool> ch_flags;
718  int raw_score = 0;
719  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
720  most_ok_chars = MAX(ok_chars, most_ok_chars);
721  best_raw_score = MAX(raw_score, best_raw_score);
722 
723  font_flags.push_back(ch_flags);
724  font_scores.push_back(ok_chars);
725  raw_scores.push_back(raw_score);
726  }
727 
728  // Now select the fonts with a score above a threshold fraction
729  // of both the raw and weighted best scores. To prevent bogus fonts being
730  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
731  // BOTH weighted and raw scores.
732  // In low character-count scripts, the issue is more getting enough fonts,
733  // when only 1 or 2 might have all those rare dingbats etc in them, so we
734  // allow a font with a very high weighted (coverage) score
735  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
736  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
737  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
738  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
739 
740  string font_list;
741  for (int i = 0; i < font_names.size(); ++i) {
742  int score = font_scores[i];
743  int raw_score = raw_scores[i];
744  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
745  score >= override_enough) {
746  fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
747  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
748  font_names[i].c_str(),
749  100.0 * score / most_ok_chars,
750  raw_score, 100.0 * raw_score / best_raw_score);
751  font_list += font_names[i];
752  font_list += "\n";
753  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
754  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
755  font_names[i].c_str(),
756  100.0 * score / most_ok_chars,
757  raw_score, 100.0 * raw_score / best_raw_score);
758  }
759  }
760  return font_list;
761 }
762 
763 /* static */
764 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
765  string* font_name, vector<string>* graphemes) {
766  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
767  graphemes);
768 }
769 
770 /* static */
771 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
772  const vector<string>& all_fonts,
773  string* font_name, vector<string>* graphemes) {
774  if (font_name) font_name->clear();
775  if (graphemes) graphemes->clear();
776  for (int i = 0; i < all_fonts.size(); ++i) {
777  PangoFontInfo font;
778  vector<string> found_graphemes;
779  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
780  "Could not parse font desc name %s\n",
781  all_fonts[i].c_str());
782  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
783  if (graphemes) graphemes->swap(found_graphemes);
784  if (font_name) *font_name = all_fonts[i];
785  return true;
786  }
787  }
788  return false;
789 }
790 
791 // PangoFontInfo is reinitialized, so clear the static list of fonts.
792 /* static */
793 void FontUtils::ReInit() { available_fonts_.clear(); }
794 
795 } // namespace tesseract
static string BestFonts(const unordered_map< char32, inT64 > &ch_map, vector< std::pair< const char *, vector< bool > > > *font_flag)
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:63
#define MAX(x, y)
Definition: ndminx.h:24
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:182
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
#define tprintf(...)
Definition: tprintf.h:31
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, vector< string > *graphemes)
static string JoinPath(const string &prefix, const string &suffix)
Definition: fileio.cpp:89
static bool IsAvailableFont(const char *font_desc)
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:53
int DropUncoveredChars(string *utf8_text) const
#define ASSERT_HOST_MSG(x, msg...)
Definition: errcode.h:98
#define ASSERT_HOST(x)
Definition: errcode.h:84
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:118
const int kDefaultResolution
Default resolution used if input is not believable.
Definition: pagesegmain.cpp:60
string DescriptionName() const
#define TLOG_IS_ON(level)
Definition: tlog.h:39
BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,"Does a one-time deletion of cache files from the ""fontconfig_tmpdir before initializing fontconfig.")
STRING_PARAM_FLAG(fonts_dir,"/auto/ocr-data/tesstraining/fonts","Overrides system default font location")
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
bool ParseFontDescriptionName(const string &name)
name_table name
char * utf8_str() const
Definition: unichar.cpp:125
signed int char32
Definition: normstrngs.h:27
static void InitFontConfig(bool force_clear, const string &fonts_dir)
static void GetAllRenderableCharacters(vector< bool > *unichar_bitmap)
static int FontScore(const unordered_map< char32, inT64 > &ch_map, const string &fontname, int *raw_score, vector< bool > *ch_flags)
#define tlog(level,...)
Definition: tlog.h:33
bool CanRenderString(const char *utf8_word, int len, vector< string > *graphemes) const
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:208
#define NULL
Definition: host.h:144
static const vector< string > & ListAvailableFonts()
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
char * strcasestr(const char *haystack, const char *needle)
Locate a substring in a string, ignoring case.
Definition: strcasestr.cpp:43
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
int utf8_len() const
Definition: unichar.cpp:186