tesseract  4.0.0-1-g2a2b
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #if (defined __MINGW32__) || (defined __CYGWIN__)
26 // workaround for stdlib.h and putenv
27 #undef __STRICT_ANSI__
28 #endif
29 
30 #include <cstdlib>
31 #include <cstdio>
32 #include <cstring>
33 #ifndef _MSC_VER
34 #include <sys/param.h>
35 #endif
36 #include <algorithm>
37 
38 #include "pango_font_info.h"
39 #include "commandlineflags.h"
40 #include "fileio.h"
41 #include "normstrngs.h"
42 #include "tlog.h"
43 #include "unichar.h"
44 #include "util.h"
45 #include "pango/pango.h"
46 #include "pango/pangocairo.h"
47 #include "pango/pangofc-font.h"
48 
49 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
50  "Overrides fontconfig default temporary dir");
51 
52 #ifdef GOOGLE_TESSERACT
53 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
54 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
55  "Overrides --fonts_dir and sets the known universe of fonts to"
56  "the list in legacy_fonts.h");
57 
58 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
59  "Overrides system default font location");
60 #else
61 using std::pair;
62 STRING_PARAM_FLAG(fonts_dir, "",
63  "If empty it use system default. Otherwise it overrides"
64  " system default font location");
65 #endif
66 
67 namespace tesseract {
68 
69 // Default assumed output resolution. Required only for providing font metrics
70 // in pixels.
71 const int kDefaultResolution = 300;
72 
73 std::string PangoFontInfo::fonts_dir_;
74 std::string PangoFontInfo::cache_dir_;
75 
77  : desc_(nullptr), resolution_(kDefaultResolution) {
78  Clear();
79 }
80 
81 PangoFontInfo::PangoFontInfo(const std::string& desc)
82  : desc_(nullptr), resolution_(kDefaultResolution) {
83  if (!ParseFontDescriptionName(desc)) {
84  tprintf("ERROR: Could not parse %s\n", desc.c_str());
85  Clear();
86  }
87 }
88 
89 void PangoFontInfo::Clear() {
90  font_size_ = 0;
91  family_name_.clear();
92  font_type_ = UNKNOWN;
93  if (desc_) {
94  pango_font_description_free(desc_);
95  desc_ = nullptr;
96  }
97 }
98 
99 PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
100 
101 std::string PangoFontInfo::DescriptionName() const {
102  if (!desc_) return "";
103  char* desc_str = pango_font_description_to_string(desc_);
104  std::string desc_name(desc_str);
105  g_free(desc_str);
106  return desc_name;
107 }
108 
109 // If not already initialized, initializes FontConfig by setting its
110 // environment variable and creating a fonts.conf file that points to the
111 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
112 /* static */
114  if (fonts_dir_.empty()) {
115  HardInitFontConfig(FLAGS_fonts_dir.c_str(),
116  FLAGS_fontconfig_tmpdir.c_str());
117  }
118 }
119 
120 // Re-initializes font config, whether or not already initialized.
121 // If already initialized, any existing cache is deleted, just to be sure.
122 /* static */
123 void PangoFontInfo::HardInitFontConfig(const std::string& fonts_dir,
124  const std::string& cache_dir) {
125  if (!cache_dir_.empty()) {
127  File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
128  }
129  const int MAX_FONTCONF_FILESIZE = 1024;
130  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
131  cache_dir_ = cache_dir;
132  fonts_dir_ = fonts_dir;
133  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
134  "<?xml version=\"1.0\"?>\n"
135  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
136  "<fontconfig>\n"
137  "<dir>%s</dir>\n"
138  "<cachedir>%s</cachedir>\n"
139  "<config></config>\n"
140  "</fontconfig>",
141  fonts_dir.c_str(), cache_dir_.c_str());
142  std::string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
143  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
144 #ifdef _WIN32
145  std::string env("FONTCONFIG_PATH=");
146  env.append(cache_dir_.c_str());
147  _putenv(env.c_str());
148  _putenv("LANG=en_US.utf8");
149 #else
150  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
151  // Fix the locale so that the reported font names are consistent.
152  setenv("LANG", "en_US.utf8", true);
153 #endif // _WIN32
154 
155  if (FcInitReinitialize() != FcTrue) {
156  tprintf("FcInitiReinitialize failed!!\n");
157  }
159  // Clear Pango's font cache too.
160  pango_cairo_font_map_set_default(nullptr);
161 }
162 
163 static void ListFontFamilies(PangoFontFamily*** families,
164  int* n_families) {
166  PangoFontMap* font_map = pango_cairo_font_map_get_default();
168  pango_font_map_list_families(font_map, families, n_families);
169 }
170 
171 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
172  Clear();
173  const char* family = pango_font_description_get_family(desc);
174  if (!family) {
175  char* desc_str = pango_font_description_to_string(desc);
176  tprintf("WARNING: Could not parse family name from description: '%s'\n",
177  desc_str);
178  g_free(desc_str);
179  return false;
180  }
181  family_name_ = std::string(family);
182  desc_ = pango_font_description_copy(desc);
183 
184  // Set font size in points
185  font_size_ = pango_font_description_get_size(desc);
186  if (!pango_font_description_get_size_is_absolute(desc)) {
187  font_size_ /= PANGO_SCALE;
188  }
189 
190  return true;
191 }
192 
193 bool PangoFontInfo::ParseFontDescriptionName(const std::string& name) {
194  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
195  bool success = ParseFontDescription(desc);
196  pango_font_description_free(desc);
197  return success;
198 }
199 
200 // Returns the PangoFont structure corresponding to the closest available font
201 // in the font map. Note that if the font is wholly missing, this could
202 // correspond to a completely different font family and face.
203 PangoFont* PangoFontInfo::ToPangoFont() const {
205  PangoFontMap* font_map = pango_cairo_font_map_get_default();
206  PangoContext* context = pango_context_new();
207  pango_cairo_context_set_resolution(context, resolution_);
208  pango_context_set_font_map(context, font_map);
209  PangoFont* font = nullptr;
210  {
212  font = pango_font_map_load_font(font_map, context, desc_);
213  }
214  g_object_unref(context);
215  return font;
216 }
217 
218 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
219  PangoFont* font = ToPangoFont();
220  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
221  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
222  it != UNICHAR::end(utf8_text, byte_length);
223  ++it) {
224  if (IsWhitespace(*it) || pango_is_zero_width(*it))
225  continue;
226  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
227  char tmp[5];
228  int len = it.get_utf8(tmp);
229  tmp[len] = '\0';
230  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
231  return false;
232  }
233  }
234  pango_coverage_unref(coverage);
235  g_object_unref(font);
236  return true;
237 }
238 
// This variant of strncpy permits src and dest to overlap. It will copy the
// first byte first.
//
// Copies at most n bytes from src to dest, stopping at the first nul byte in
// src; if fewer than n bytes were copied, the rest of the n bytes in dest are
// set to nul. Nothing is read or written when n is zero, and src is never
// read beyond its terminating nul. Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;

  // Copy characters until n reaches zero or the src byte is a nul.
  while (n != 0 && *src != '\0') {
    *dest = *src;
    --n;
    ++dest;
    ++src;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n != 0) {
    *dest = '\0';
    --n;
    ++dest;
  }
  return ret;
}
260 
261 int PangoFontInfo::DropUncoveredChars(std::string* utf8_text) const {
262  PangoFont* font = ToPangoFont();
263  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
264  int num_dropped_chars = 0;
265  // Maintain two iterators that point into the string. For space efficiency, we
266  // will repeatedly copy one covered UTF8 character from one to the other, and
267  // at the end resize the string to the right length.
268  char* out = const_cast<char*>(utf8_text->c_str());
269  const UNICHAR::const_iterator it_begin =
270  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
271  const UNICHAR::const_iterator it_end =
272  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
273  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
274  // Skip bad utf-8.
275  if (!it.is_legal()) {
276  ++it; // One suitable error message will still be issued.
277  continue;
278  }
279  int unicode = *it;
280  int utf8_len = it.utf8_len();
281  const char* utf8_char = it.utf8_data();
282  // Move it forward before the data gets modified.
283  ++it;
284  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
285  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
286  if (TLOG_IS_ON(2)) {
287  UNICHAR unichar(unicode);
288  char* str = unichar.utf8_str();
289  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
290  delete[] str;
291  }
292  ++num_dropped_chars;
293  continue;
294  }
295  my_strnmove(out, utf8_char, utf8_len);
296  out += utf8_len;
297  }
298  pango_coverage_unref(coverage);
299  g_object_unref(font);
300  utf8_text->resize(out - utf8_text->c_str());
301  return num_dropped_chars;
302 }
303 
304 bool PangoFontInfo::GetSpacingProperties(const std::string& utf8_char,
305  int* x_bearing, int* x_advance) const {
306  // Convert to equivalent PangoFont structure
307  PangoFont* font = ToPangoFont();
308  // Find the glyph index in the font for the supplied utf8 character.
309  int total_advance = 0;
310  int min_bearing = 0;
311  // Handle multi-unicode strings by reporting the left-most position of the
312  // x-bearing, and right-most position of the x-advance if the string were to
313  // be rendered.
314  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
315  utf8_char.length());
316  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
317  utf8_char.length());
318  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
319  PangoGlyph glyph_index = pango_fc_font_get_glyph(
320  reinterpret_cast<PangoFcFont*>(font), *it);
321  if (!glyph_index) {
322  // Glyph for given unicode character doesn't exist in font.
323  g_object_unref(font);
324  return false;
325  }
326  // Find the ink glyph extents for the glyph
327  PangoRectangle ink_rect, logical_rect;
328  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
329  pango_extents_to_pixels(&ink_rect, nullptr);
330  pango_extents_to_pixels(&logical_rect, nullptr);
331 
332  int bearing = total_advance + PANGO_LBEARING(ink_rect);
333  if (it == it_begin || bearing < min_bearing) {
334  min_bearing = bearing;
335  }
336  total_advance += PANGO_RBEARING(logical_rect);
337  }
338  *x_bearing = min_bearing;
339  *x_advance = total_advance;
340  g_object_unref(font);
341  return true;
342 }
343 
344 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
345  std::vector<std::string> graphemes;
346  return CanRenderString(utf8_word, len, &graphemes);
347 }
348 
349 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
350  std::vector<std::string>* graphemes) const {
351  if (graphemes) graphemes->clear();
352  // We check for font coverage of the text first, as otherwise Pango could
353  // (undesirably) fall back to another font that does have the required
354  // coverage.
355  if (!CoversUTF8Text(utf8_word, len)) {
356  return false;
357  }
358  // U+25CC dotted circle character that often (but not always) gets rendered
359  // when there is an illegal grapheme sequence.
360  const char32 kDottedCircleGlyph = 9676;
361  bool bad_glyph = false;
362  PangoFontMap* font_map = pango_cairo_font_map_get_default();
363  PangoContext* context = pango_context_new();
364  pango_context_set_font_map(context, font_map);
365  PangoLayout* layout;
366  {
367  // Pango is not releasing the cached layout.
369  layout = pango_layout_new(context);
370  }
371  if (desc_) {
372  pango_layout_set_font_description(layout, desc_);
373  } else {
374  PangoFontDescription *desc = pango_font_description_from_string(
375  DescriptionName().c_str());
376  pango_layout_set_font_description(layout, desc);
377  pango_font_description_free(desc);
378  }
379  pango_layout_set_text(layout, utf8_word, len);
380  PangoLayoutIter* run_iter = nullptr;
381  { // Fontconfig caches some information here that is not freed before exit.
383  run_iter = pango_layout_get_iter(layout);
384  }
385  do {
386  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
387  if (!run) {
388  tlog(2, "Found end of line nullptr run marker\n");
389  continue;
390  }
391  PangoGlyph dotted_circle_glyph;
392  PangoFont* font = run->item->analysis.font;
393 
394 #ifdef _WIN32 // Fixme! Leaks memory and breaks unittests.
395  PangoGlyphString* glyphs = pango_glyph_string_new();
396  char s[] = "\xc2\xa7";
397  pango_shape(s, sizeof(s), &(run->item->analysis), glyphs);
398  dotted_circle_glyph = glyphs->glyphs[0].glyph;
399 #else
400  dotted_circle_glyph = pango_fc_font_get_glyph(
401  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
402 #endif
403 
404  if (TLOG_IS_ON(2)) {
405  PangoFontDescription* desc = pango_font_describe(font);
406  char* desc_str = pango_font_description_to_string(desc);
407  tlog(2, "Desc of font in run: %s\n", desc_str);
408  g_free(desc_str);
409  pango_font_description_free(desc);
410  }
411 
412  PangoGlyphItemIter cluster_iter;
413  gboolean have_cluster;
414  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
415  run, utf8_word);
416  have_cluster && !bad_glyph;
417  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
418  const int start_byte_index = cluster_iter.start_index;
419  const int end_byte_index = cluster_iter.end_index;
420  int start_glyph_index = cluster_iter.start_glyph;
421  int end_glyph_index = cluster_iter.end_glyph;
422  std::string cluster_text = std::string(utf8_word + start_byte_index,
423  end_byte_index - start_byte_index);
424  if (graphemes) graphemes->push_back(cluster_text);
425  if (IsUTF8Whitespace(cluster_text.c_str())) {
426  tlog(2, "Skipping whitespace\n");
427  continue;
428  }
429  if (TLOG_IS_ON(2)) {
430  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
431  start_byte_index, end_byte_index,
432  start_glyph_index, end_glyph_index);
433  }
434  for (int i = start_glyph_index,
435  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
436  !bad_glyph && i != end_glyph_index; i+= step) {
437  const bool unknown_glyph =
438  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
439  PANGO_GLYPH_UNKNOWN_FLAG);
440  const bool illegal_glyph =
441  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
442  dotted_circle_glyph);
443  bad_glyph = unknown_glyph || illegal_glyph;
444  if (TLOG_IS_ON(2)) {
445  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
446  bad_glyph ? 1 : 0);
447  }
448  }
449  if (TLOG_IS_ON(2)) {
450  printf(" '%s'\n", cluster_text.c_str());
451  }
452  if (bad_glyph)
453  tlog(1, "Found illegal glyph!\n");
454  }
455  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
456 
457  pango_layout_iter_free(run_iter);
458  g_object_unref(context);
459  g_object_unref(layout);
460  if (bad_glyph && graphemes) graphemes->clear();
461  return !bad_glyph;
462 }
463 
464 
465 // ------------------------ FontUtils ------------------------------------
466 std::vector<std::string> FontUtils::available_fonts_; // cache list
467 
468 // Returns whether the specified font description is available in the fonts
469 // directory.
470 //
471 // The generated list of font families and faces includes "synthesized" font
472 // faces that are not truly loadable. Pango versions >=1.18 have a
473 // pango_font_face_is_synthesized method that can be used to prune the list.
474 // Until then, we are restricted to using a hack where we try to load the font
475 // from the font_map, and then check what we loaded to see if it has the
476 // description we expected. If it is not, then the font is deemed unavailable.
477 /* static */
478 bool FontUtils::IsAvailableFont(const char* input_query_desc,
479  std::string* best_match) {
480  std::string query_desc(input_query_desc);
481  PangoFontDescription *desc = pango_font_description_from_string(
482  query_desc.c_str());
483  PangoFont* selected_font = nullptr;
484  {
486  PangoFontMap* font_map = pango_cairo_font_map_get_default();
487  PangoContext* context = pango_context_new();
488  pango_context_set_font_map(context, font_map);
489  {
491  selected_font = pango_font_map_load_font(font_map, context, desc);
492  }
493  g_object_unref(context);
494  }
495  if (selected_font == nullptr) {
496  pango_font_description_free(desc);
497  return false;
498  }
499  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
500 
501  bool equal = pango_font_description_equal(desc, selected_desc);
502  tlog(3, "query weight = %d \t selected weight =%d\n",
503  pango_font_description_get_weight(desc),
504  pango_font_description_get_weight(selected_desc));
505 
506  char* selected_desc_str = pango_font_description_to_string(selected_desc);
507  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
508  selected_desc_str);
509  if (!equal && best_match != nullptr) {
510  *best_match = selected_desc_str;
511  // Clip the ending ' 0' if there is one. It seems that, if there is no
512  // point size on the end of the fontname, then Pango always appends ' 0'.
513  int len = best_match->size();
514  if (len > 2 && best_match->at(len - 1) == '0' &&
515  best_match->at(len - 2) == ' ') {
516  *best_match = best_match->substr(0, len - 2);
517  }
518  }
519  g_free(selected_desc_str);
520  pango_font_description_free(selected_desc);
521  g_object_unref(selected_font);
522  pango_font_description_free(desc);
523  return equal;
524 }
525 
// Returns true if query is exactly (case-sensitively) one of the family names
// "Sans", "Serif" or "Monospace", which are left out of the font listing.
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kGenericFamilies[] = {"Sans", "Serif", "Monospace"};
  for (const char* generic : kGenericFamilies) {
    if (strcmp(generic, query) == 0) {
      return true;
    }
  }
  return false;
}
536 
537 // Outputs description names of available fonts.
538 /* static */
539 const std::vector<std::string>& FontUtils::ListAvailableFonts() {
540  if (!available_fonts_.empty()) {
541  return available_fonts_;
542  }
543 #ifdef GOOGLE_TESSERACT
544  if (FLAGS_use_only_legacy_fonts) {
545  // Restrict view to list of fonts in legacy_fonts.h
546  tprintf("Using list of legacy fonts only\n");
547  const int kNumFontLists = 4;
548  for (int i = 0; i < kNumFontLists; ++i) {
549  for (int j = 0; kFontlists[i][j] != nullptr; ++j) {
550  available_fonts_.push_back(kFontlists[i][j]);
551  }
552  }
553  return available_fonts_;
554  }
555 #endif
556 
557  PangoFontFamily** families = nullptr;
558  int n_families = 0;
559  ListFontFamilies(&families, &n_families);
560  for (int i = 0; i < n_families; ++i) {
561  const char* family_name = pango_font_family_get_name(families[i]);
562  tlog(2, "Listing family %s\n", family_name);
563  if (ShouldIgnoreFontFamilyName(family_name)) {
564  continue;
565  }
566 
567  int n_faces;
568  PangoFontFace** faces = nullptr;
569  pango_font_family_list_faces(families[i], &faces, &n_faces);
570  for (int j = 0; j < n_faces; ++j) {
571  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
572  char* desc_str = pango_font_description_to_string(desc);
573  if (IsAvailableFont(desc_str)) {
574  available_fonts_.push_back(desc_str);
575  }
576  pango_font_description_free(desc);
577  g_free(desc_str);
578  }
579  g_free(faces);
580  }
581  g_free(families);
582  std::sort(available_fonts_.begin(), available_fonts_.end());
583  return available_fonts_;
584 }
585 
586 
587 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
588  std::vector<bool>* unichar_bitmap) {
589  const int kMinUnicodeValue = 33;
590  const int kMaxUnicodeValue = 0x10FFFF;
591  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
592  // Mark off characters that the font can render.
593  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
594  if (IsInterchangeValid(i)) {
595  (*unichar_bitmap)[i]
596  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
597  }
598  }
599 }
600 
601 /* static */
602 void FontUtils::GetAllRenderableCharacters(std::vector<bool>* unichar_bitmap) {
603  const std::vector<std::string>& all_fonts = ListAvailableFonts();
604  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
605 }
606 
607 /* static */
608 void FontUtils::GetAllRenderableCharacters(const std::string& font_name,
609  std::vector<bool>* unichar_bitmap) {
610  PangoFontInfo font_info(font_name);
611  PangoFont* font = font_info.ToPangoFont();
612  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
613  CharCoverageMapToBitmap(coverage, unichar_bitmap);
614  pango_coverage_unref(coverage);
615  g_object_unref(font);
616 }
617 
618 /* static */
619 void FontUtils::GetAllRenderableCharacters(const std::vector<std::string>& fonts,
620  std::vector<bool>* unichar_bitmap) {
621  // Form the union of coverage maps from the fonts
622  PangoCoverage* all_coverage = pango_coverage_new();
623  tlog(1, "Processing %u fonts\n", static_cast<unsigned>(fonts.size()));
624  for (unsigned i = 0; i < fonts.size(); ++i) {
625  PangoFontInfo font_info(fonts[i]);
626  PangoFont* font = font_info.ToPangoFont();
627  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
628  // Mark off characters that any font can render.
629  pango_coverage_max(all_coverage, coverage);
630  pango_coverage_unref(coverage);
631  g_object_unref(font);
632  }
633  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
634  pango_coverage_unref(all_coverage);
635 }
636 
637 
638 // Utilities written to be backward compatible with StringRender
639 
640 /* static */
641 int FontUtils::FontScore(const std::unordered_map<char32, int64_t>& ch_map,
642  const std::string& fontname, int* raw_score,
643  std::vector<bool>* ch_flags) {
644  PangoFontInfo font_info;
645  if (!font_info.ParseFontDescriptionName(fontname)) {
646  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
647  }
648  PangoFont* font = font_info.ToPangoFont();
649  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
650 
651  if (ch_flags) {
652  ch_flags->clear();
653  ch_flags->reserve(ch_map.size());
654  }
655  *raw_score = 0;
656  int ok_chars = 0;
657  for (std::unordered_map<char32, int64_t>::const_iterator it = ch_map.begin();
658  it != ch_map.end(); ++it) {
659  bool covered = (IsWhitespace(it->first) ||
660  (pango_coverage_get(coverage, it->first)
661  == PANGO_COVERAGE_EXACT));
662  if (covered) {
663  ++(*raw_score);
664  ok_chars += it->second;
665  }
666  if (ch_flags) {
667  ch_flags->push_back(covered);
668  }
669  }
670  pango_coverage_unref(coverage);
671  g_object_unref(font);
672  return ok_chars;
673 }
674 
675 
676 /* static */
678  const std::unordered_map<char32, int64_t>& ch_map,
679  std::vector<std::pair<const char*, std::vector<bool> > >* fonts) {
680  const double kMinOKFraction = 0.99;
681  // Weighted fraction of characters that must be renderable in a font to make
682  // it OK even if the raw count is not good.
683  const double kMinWeightedFraction = 0.99995;
684 
685  fonts->clear();
686  std::vector<std::vector<bool> > font_flags;
687  std::vector<int> font_scores;
688  std::vector<int> raw_scores;
689  int most_ok_chars = 0;
690  int best_raw_score = 0;
691  const std::vector<std::string>& font_names = FontUtils::ListAvailableFonts();
692  for (unsigned i = 0; i < font_names.size(); ++i) {
693  std::vector<bool> ch_flags;
694  int raw_score = 0;
695  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
696  most_ok_chars = std::max(ok_chars, most_ok_chars);
697  best_raw_score = std::max(raw_score, best_raw_score);
698 
699  font_flags.push_back(ch_flags);
700  font_scores.push_back(ok_chars);
701  raw_scores.push_back(raw_score);
702  }
703 
704  // Now select the fonts with a score above a threshold fraction
705  // of both the raw and weighted best scores. To prevent bogus fonts being
706  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
707  // BOTH weighted and raw scores.
708  // In low character-count scripts, the issue is more getting enough fonts,
709  // when only 1 or 2 might have all those rare dingbats etc in them, so we
710  // allow a font with a very high weighted (coverage) score
711  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
712  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
713  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
714  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
715 
716  std::string font_list;
717  for (unsigned i = 0; i < font_names.size(); ++i) {
718  int score = font_scores[i];
719  int raw_score = raw_scores[i];
720  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
721  score >= override_enough) {
722  fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i]));
723  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
724  font_names[i].c_str(),
725  100.0 * score / most_ok_chars,
726  raw_score, 100.0 * raw_score / best_raw_score);
727  font_list += font_names[i];
728  font_list += "\n";
729  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
730  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
731  font_names[i].c_str(),
732  100.0 * score / most_ok_chars,
733  raw_score, 100.0 * raw_score / best_raw_score);
734  }
735  }
736  return font_list;
737 }
738 
739 /* static */
740 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
741  std::string* font_name, std::vector<std::string>* graphemes) {
742  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
743  graphemes);
744 }
745 
746 /* static */
747 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
748  const std::vector<std::string>& all_fonts,
749  std::string* font_name, std::vector<std::string>* graphemes) {
750  if (font_name) font_name->clear();
751  if (graphemes) graphemes->clear();
752  for (unsigned i = 0; i < all_fonts.size(); ++i) {
753  PangoFontInfo font;
754  std::vector<std::string> found_graphemes;
755  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
756  "Could not parse font desc name %s\n",
757  all_fonts[i].c_str());
758  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
759  if (graphemes) graphemes->swap(found_graphemes);
760  if (font_name) *font_name = all_fonts[i];
761  return true;
762  }
763  }
764  return false;
765 }
766 
767 // PangoFontInfo is reinitialized, so clear the static list of fonts.
768 /* static */
769 void FontUtils::ReInit() { available_fonts_.clear(); }
770 
771 // Print info about used font backend
772 /* static */
774  PangoFontMap* font_map = pango_cairo_font_map_get_default();
775  if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap*>(
776  font_map)) == CAIRO_FONT_TYPE_TOY) {
777  printf("Using CAIRO_FONT_TYPE_TOY.\n");
778  } else if (pango_cairo_font_map_get_font_type(
779  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
780  CAIRO_FONT_TYPE_FT) {
781  printf("Using CAIRO_FONT_TYPE_FT.\n");
782  } else if (pango_cairo_font_map_get_font_type(
783  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
784  CAIRO_FONT_TYPE_WIN32) {
785  printf("Using CAIRO_FONT_TYPE_WIN32.\n");
786  } else if (pango_cairo_font_map_get_font_type(
787  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
788  CAIRO_FONT_TYPE_QUARTZ) {
789  printf("Using CAIRO_FONT_TYPE_QUARTZ.\n");
790  } else if (pango_cairo_font_map_get_font_type(
791  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
792  CAIRO_FONT_TYPE_USER) {
793  printf("Using CAIRO_FONT_TYPE_USER.\n");
794  } else if (!font_map) {
795  printf("Can not create pango cairo font map!\n");
796  }
797 }
798 
799 } // namespace tesseract
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:112
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:62
static void PangoFontTypeInfo()
signed int char32
Definition: unichar.h:52
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:253
std::string DescriptionName() const
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:223
const int kDefaultResolution
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
#define TLOG_IS_ON(level)
Definition: tlog.h:39
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
#define tlog(level,...)
Definition: tlog.h:33
static bool IsAvailableFont(const char *font_desc)
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
bool GetSpacingProperties(const std::string &utf8_char, int *x_bearing, int *x_advance) const
#define BOOL_PARAM_FLAG(name, val, comment)
char * utf8_str() const
Definition: unichar.cpp:127
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir")
static std::string BestFonts(const std::unordered_map< char32, int64_t > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:53
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206
static std::string JoinPath(const std::string &prefix, const std::string &suffix)
Definition: fileio.cpp:82
bool ParseFontDescriptionName(const std::string &name)
static void HardInitFontConfig(const std::string &fonts_dir, const std::string &cache_dir)
static void GetAllRenderableCharacters(std::vector< bool > *unichar_bitmap)
static const std::vector< std::string > & ListAvailableFonts()
int DropUncoveredChars(std::string *utf8_text) const