tesseract  5.0.0-alpha-619-ge9db
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  *
6  * (C) Copyright 2013, Google Inc.
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  * http://www.apache.org/licenses/LICENSE-2.0
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #if (defined __MINGW32__) || (defined __CYGWIN__)
25 // workaround for stdlib.h and putenv
26 #undef __STRICT_ANSI__
27 #endif
28 
29 #include <cstdlib>
30 #include <cstdio>
31 #include <cstring>
32 #ifndef _MSC_VER
33 #include <sys/param.h>
34 #endif
35 #include <algorithm>
36 
37 #include "pango_font_info.h"
38 #include "commandlineflags.h"
39 #include "fileio.h"
40 #include "normstrngs.h"
41 #include "tlog.h"
42 #include <tesseract/unichar.h>
43 #include "util.h"
44 #include "pango/pango.h"
45 #include "pango/pangocairo.h"
46 #include "pango/pangofc-font.h"
47 
48 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
49  "Overrides fontconfig default temporary dir");
50 
51 #ifdef GOOGLE_TESSERACT
52 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
53 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
54  "Overrides --fonts_dir and sets the known universe of fonts to"
55  "the list in legacy_fonts.h");
56 
57 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
58  "Overrides system default font location");
59 #else
60 using std::pair;
61 STRING_PARAM_FLAG(fonts_dir, "",
62  "If empty it use system default. Otherwise it overrides"
63  " system default font location");
64 #endif
65 
66 namespace tesseract {
67 
68 // Default assumed output resolution. Required only for providing font metrics
69 // in pixels.
70 const int kDefaultResolution = 300;
71 
72 std::string PangoFontInfo::fonts_dir_;
73 std::string PangoFontInfo::cache_dir_;
74 
76  : desc_(nullptr), resolution_(kDefaultResolution) {
77  Clear();
78 }
79 
81  : desc_(nullptr), resolution_(kDefaultResolution) {
82  if (!ParseFontDescriptionName(desc)) {
83  tprintf("ERROR: Could not parse %s\n", desc.c_str());
84  Clear();
85  }
86 }
87 
88 void PangoFontInfo::Clear() {
89  font_size_ = 0;
90  family_name_.clear();
91  font_type_ = UNKNOWN;
92  if (desc_) {
93  pango_font_description_free(desc_);
94  desc_ = nullptr;
95  }
96 }
97 
98 PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
99 
101  if (!desc_) return "";
102  char* desc_str = pango_font_description_to_string(desc_);
103  std::string desc_name(desc_str);
104  g_free(desc_str);
105  return desc_name;
106 }
107 
108 // If not already initialized, initializes FontConfig by setting its
109 // environment variable and creating a fonts.conf file that points to the
110 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
111 /* static */
113  if (fonts_dir_.empty()) {
114  HardInitFontConfig(FLAGS_fonts_dir.c_str(),
115  FLAGS_fontconfig_tmpdir.c_str());
116  }
117 }
118 
119 // Re-initializes font config, whether or not already initialized.
120 // If already initialized, any existing cache is deleted, just to be sure.
121 /* static */
122 void PangoFontInfo::HardInitFontConfig(const std::string& fonts_dir,
123  const std::string& cache_dir) {
124  if (!cache_dir_.empty()) {
126  File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
127  }
128  const int MAX_FONTCONF_FILESIZE = 1024;
129  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
130  cache_dir_ = cache_dir;
131  fonts_dir_ = fonts_dir;
132  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
133  "<?xml version=\"1.0\"?>\n"
134  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
135  "<fontconfig>\n"
136  "<dir>%s</dir>\n"
137  "<cachedir>%s</cachedir>\n"
138  "<config></config>\n"
139  "</fontconfig>",
140  fonts_dir.c_str(), cache_dir_.c_str());
141  std::string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
142  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
143 #ifdef _WIN32
144  std::string env("FONTCONFIG_PATH=");
145  env.append(cache_dir_.c_str());
146  _putenv(env.c_str());
147  _putenv("LANG=en_US.utf8");
148 #else
149  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
150  // Fix the locale so that the reported font names are consistent.
151  setenv("LANG", "en_US.utf8", true);
152 #endif // _WIN32
153 
154  if (FcInitReinitialize() != FcTrue) {
155  tprintf("FcInitiReinitialize failed!!\n");
156  }
158  // Clear Pango's font cache too.
159  pango_cairo_font_map_set_default(nullptr);
160 }
161 
162 static void ListFontFamilies(PangoFontFamily*** families,
163  int* n_families) {
165  PangoFontMap* font_map = pango_cairo_font_map_get_default();
167  pango_font_map_list_families(font_map, families, n_families);
168 }
169 
170 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
171  Clear();
172  const char* family = pango_font_description_get_family(desc);
173  if (!family) {
174  char* desc_str = pango_font_description_to_string(desc);
175  tprintf("WARNING: Could not parse family name from description: '%s'\n",
176  desc_str);
177  g_free(desc_str);
178  return false;
179  }
180  family_name_ = std::string(family);
181  desc_ = pango_font_description_copy(desc);
182 
183  // Set font size in points
184  font_size_ = pango_font_description_get_size(desc);
185  if (!pango_font_description_get_size_is_absolute(desc)) {
186  font_size_ /= PANGO_SCALE;
187  }
188 
189  return true;
190 }
191 
193  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
194  bool success = ParseFontDescription(desc);
195  pango_font_description_free(desc);
196  return success;
197 }
198 
199 // Returns the PangoFont structure corresponding to the closest available font
200 // in the font map. Note that if the font is wholly missing, this could
201 // correspond to a completely different font family and face.
202 PangoFont* PangoFontInfo::ToPangoFont() const {
204  PangoFontMap* font_map = pango_cairo_font_map_get_default();
205  PangoContext* context = pango_context_new();
206  pango_cairo_context_set_resolution(context, resolution_);
207  pango_context_set_font_map(context, font_map);
208  PangoFont* font = nullptr;
209  {
211  font = pango_font_map_load_font(font_map, context, desc_);
212  }
213  g_object_unref(context);
214  return font;
215 }
216 
217 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
218  PangoFont* font = ToPangoFont();
219  if (font == nullptr) {
220  // Font not found.
221  return false;
222  }
223  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
224  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
225  it != UNICHAR::end(utf8_text, byte_length);
226  ++it) {
227  if (IsWhitespace(*it) || pango_is_zero_width(*it))
228  continue;
229  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
230  char tmp[5];
231  int len = it.get_utf8(tmp);
232  tmp[len] = '\0';
233  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
234  pango_coverage_unref(coverage);
235  g_object_unref(font);
236  return false;
237  }
238  }
239  pango_coverage_unref(coverage);
240  g_object_unref(font);
241  return true;
242 }
243 
// This variant of strncpy permits src and dest to overlap, as long as dest
// does not start after src: bytes are copied front to back, first byte first.
// Copies at most n bytes from src, stopping at the first nul byte, and fills
// whatever remains of the n bytes in dest with nuls. Returns dest.
// A count of zero is a no-op, and each source byte is tested for nul before
// it is copied, so src is never read beyond its terminator.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* out = dest;

  // Copy characters until n reaches zero or the src byte is a nul.
  while (n != 0 && *src != '\0') {
    *out++ = *src++;
    --n;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n != 0) {
    *out++ = '\0';
    --n;
  }
  return dest;
}
265 
266 int PangoFontInfo::DropUncoveredChars(std::string* utf8_text) const {
267  int num_dropped_chars = 0;
268  PangoFont* font = ToPangoFont();
269  if (font == nullptr) {
270  // Font not found, drop all characters.
271  num_dropped_chars = utf8_text->length();
272  utf8_text->resize(0);
273  return num_dropped_chars;
274  }
275  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
276  // Maintain two iterators that point into the string. For space efficiency, we
277  // will repeatedly copy one covered UTF8 character from one to the other, and
278  // at the end resize the string to the right length.
279  char* out = const_cast<char*>(utf8_text->c_str());
280  const UNICHAR::const_iterator it_begin =
281  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
282  const UNICHAR::const_iterator it_end =
283  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
284  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
285  // Skip bad utf-8.
286  if (!it.is_legal()) {
287  ++it; // One suitable error message will still be issued.
288  continue;
289  }
290  int unicode = *it;
291  int utf8_len = it.utf8_len();
292  const char* utf8_char = it.utf8_data();
293  // Move it forward before the data gets modified.
294  ++it;
295  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
296  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
297  if (TLOG_IS_ON(2)) {
298  UNICHAR unichar(unicode);
299  char* str = unichar.utf8_str();
300  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
301  delete[] str;
302  }
303  ++num_dropped_chars;
304  continue;
305  }
306  my_strnmove(out, utf8_char, utf8_len);
307  out += utf8_len;
308  }
309  pango_coverage_unref(coverage);
310  g_object_unref(font);
311  utf8_text->resize(out - utf8_text->c_str());
312  return num_dropped_chars;
313 }
314 
316  int* x_bearing, int* x_advance) const {
317  // Convert to equivalent PangoFont structure
318  PangoFont* font = ToPangoFont();
319  // Find the glyph index in the font for the supplied utf8 character.
320  int total_advance = 0;
321  int min_bearing = 0;
322  // Handle multi-unicode strings by reporting the left-most position of the
323  // x-bearing, and right-most position of the x-advance if the string were to
324  // be rendered.
325  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
326  utf8_char.length());
327  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
328  utf8_char.length());
329  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
330  PangoGlyph glyph_index = pango_fc_font_get_glyph(
331  reinterpret_cast<PangoFcFont*>(font), *it);
332  if (!glyph_index) {
333  // Glyph for given unicode character doesn't exist in font.
334  g_object_unref(font);
335  return false;
336  }
337  // Find the ink glyph extents for the glyph
338  PangoRectangle ink_rect, logical_rect;
339  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
340  pango_extents_to_pixels(&ink_rect, nullptr);
341  pango_extents_to_pixels(&logical_rect, nullptr);
342 
343  int bearing = total_advance + PANGO_LBEARING(ink_rect);
344  if (it == it_begin || bearing < min_bearing) {
345  min_bearing = bearing;
346  }
347  total_advance += PANGO_RBEARING(logical_rect);
348  }
349  *x_bearing = min_bearing;
350  *x_advance = total_advance;
351  g_object_unref(font);
352  return true;
353 }
354 
355 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
356  std::vector<std::string> graphemes;
357  return CanRenderString(utf8_word, len, &graphemes);
358 }
359 
360 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
361  std::vector<std::string>* graphemes) const {
362  if (graphemes) graphemes->clear();
363  // We check for font coverage of the text first, as otherwise Pango could
364  // (undesirably) fall back to another font that does have the required
365  // coverage.
366  if (!CoversUTF8Text(utf8_word, len)) {
367  return false;
368  }
369  // U+25CC dotted circle character that often (but not always) gets rendered
370  // when there is an illegal grapheme sequence.
371  const char32 kDottedCircleGlyph = 9676;
372  bool bad_glyph = false;
373  PangoFontMap* font_map = pango_cairo_font_map_get_default();
374  PangoContext* context = pango_context_new();
375  pango_context_set_font_map(context, font_map);
376  PangoLayout* layout;
377  {
378  // Pango is not releasing the cached layout.
380  layout = pango_layout_new(context);
381  }
382  if (desc_) {
383  pango_layout_set_font_description(layout, desc_);
384  } else {
385  PangoFontDescription *desc = pango_font_description_from_string(
386  DescriptionName().c_str());
387  pango_layout_set_font_description(layout, desc);
388  pango_font_description_free(desc);
389  }
390  pango_layout_set_text(layout, utf8_word, len);
391  PangoLayoutIter* run_iter = nullptr;
392  { // Fontconfig caches some information here that is not freed before exit.
394  run_iter = pango_layout_get_iter(layout);
395  }
396  do {
397  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
398  if (!run) {
399  tlog(2, "Found end of line nullptr run marker\n");
400  continue;
401  }
402  PangoGlyph dotted_circle_glyph;
403  PangoFont* font = run->item->analysis.font;
404 
405 #ifdef _WIN32
406  PangoGlyphString* glyphs = pango_glyph_string_new();
407  const char s[] = "\xc2\xa7";
408  pango_shape(s, strlen(s), &(run->item->analysis), glyphs);
409  dotted_circle_glyph = glyphs->glyphs[0].glyph;
410 #else // TODO: Do we need separate solution for non win build?
411  dotted_circle_glyph = pango_fc_font_get_glyph(
412  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
413 #endif
414 
415  if (TLOG_IS_ON(2)) {
416  PangoFontDescription* desc = pango_font_describe(font);
417  char* desc_str = pango_font_description_to_string(desc);
418  tlog(2, "Desc of font in run: %s\n", desc_str);
419  g_free(desc_str);
420  pango_font_description_free(desc);
421  }
422 
423  PangoGlyphItemIter cluster_iter;
424  gboolean have_cluster;
425  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
426  run, utf8_word);
427  have_cluster && !bad_glyph;
428  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
429  const int start_byte_index = cluster_iter.start_index;
430  const int end_byte_index = cluster_iter.end_index;
431  int start_glyph_index = cluster_iter.start_glyph;
432  int end_glyph_index = cluster_iter.end_glyph;
433  std::string cluster_text = std::string(utf8_word + start_byte_index,
434  end_byte_index - start_byte_index);
435  if (graphemes) graphemes->push_back(cluster_text);
436  if (IsUTF8Whitespace(cluster_text.c_str())) {
437  tlog(2, "Skipping whitespace\n");
438  continue;
439  }
440  if (TLOG_IS_ON(2)) {
441  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
442  start_byte_index, end_byte_index,
443  start_glyph_index, end_glyph_index);
444  }
445  for (int i = start_glyph_index,
446  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
447  !bad_glyph && i != end_glyph_index; i+= step) {
448  const bool unknown_glyph =
449  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
450  PANGO_GLYPH_UNKNOWN_FLAG);
451  const bool illegal_glyph =
452  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
453  dotted_circle_glyph);
454  bad_glyph = unknown_glyph || illegal_glyph;
455  if (TLOG_IS_ON(2)) {
456  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
457  bad_glyph ? 1 : 0);
458  }
459  }
460  if (TLOG_IS_ON(2)) {
461  printf(" '%s'\n", cluster_text.c_str());
462  }
463  if (bad_glyph)
464  tlog(1, "Found illegal glyph!\n");
465  }
466 #ifdef _WIN32
467  pango_glyph_string_free(glyphs);
468 #endif
469  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
470 
471  pango_layout_iter_free(run_iter);
472  g_object_unref(context);
473  g_object_unref(layout);
474  if (bad_glyph && graphemes) graphemes->clear();
475  return !bad_glyph;
476 }
477 
478 
479 // ------------------------ FontUtils ------------------------------------
480 std::vector<std::string> FontUtils::available_fonts_; // cache list
481 
482 // Returns whether the specified font description is available in the fonts
483 // directory.
484 //
485 // The generated list of font families and faces includes "synthesized" font
486 // faces that are not truly loadable. Pango versions >=1.18 have a
487 // pango_font_face_is_synthesized method that can be used to prune the list.
488 // Until then, we are restricted to using a hack where we try to load the font
489 // from the font_map, and then check what we loaded to see if it has the
490 // description we expected. If it is not, then the font is deemed unavailable.
491 //
492 // TODO: This function also reports some non-synthesized fonts as unavailable,
493 // e.g. 'Bitstream Charter Medium Italic' and 'LMRoman17', so we need this
494 // hack until another solution is found.
495 /* static */
496 bool FontUtils::IsAvailableFont(const char* input_query_desc,
497  std::string* best_match) {
498  std::string query_desc(input_query_desc);
499  PangoFontDescription *desc = pango_font_description_from_string(
500  query_desc.c_str());
501  PangoFont* selected_font = nullptr;
502  {
504  PangoFontMap* font_map = pango_cairo_font_map_get_default();
505  PangoContext* context = pango_context_new();
506  pango_context_set_font_map(context, font_map);
507  {
509  selected_font = pango_font_map_load_font(font_map, context, desc);
510  }
511  g_object_unref(context);
512  }
513  if (selected_font == nullptr) {
514  pango_font_description_free(desc);
515  tlog(4, "** Font '%s' failed to load from font map!\n", input_query_desc);
516  return false;
517  }
518  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
519 
520  bool equal = pango_font_description_equal(desc, selected_desc);
521  tlog(3, "query weight = %d \t selected weight =%d\n",
522  pango_font_description_get_weight(desc),
523  pango_font_description_get_weight(selected_desc));
524 
525  char* selected_desc_str = pango_font_description_to_string(selected_desc);
526  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
527  selected_desc_str);
528  if (!equal && best_match != nullptr) {
529  *best_match = selected_desc_str;
530  // Clip the ending ' 0' if there is one. It seems that, if there is no
531  // point size on the end of the fontname, then Pango always appends ' 0'.
532  int len = best_match->size();
533  if (len > 2 && best_match->at(len - 1) == '0' &&
534  best_match->at(len - 2) == ' ') {
535  *best_match = best_match->substr(0, len - 2);
536  }
537  }
538  g_free(selected_desc_str);
539  pango_font_description_free(selected_desc);
540  g_object_unref(selected_font);
541  pango_font_description_free(desc);
542  if (!equal)
543  tlog(4, "** Font '%s' failed pango_font_description_equal!\n",
544  input_query_desc);
545  return equal;
546 }
547 
// Returns true when |query| is exactly (case-sensitively) one of the family
// names "Sans", "Serif" or "Monospace".
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kSkippedFamilies[] = {"Sans", "Serif", "Monospace"};
  for (const char* skipped : kSkippedFamilies) {
    if (strcmp(skipped, query) == 0) {
      return true;
    }
  }
  return false;
}
558 
559 // Outputs description names of available fonts.
560 /* static */
561 const std::vector<std::string>& FontUtils::ListAvailableFonts() {
562  if (!available_fonts_.empty()) {
563  return available_fonts_;
564  }
565 #ifdef GOOGLE_TESSERACT
566  if (FLAGS_use_only_legacy_fonts) {
567  // Restrict view to list of fonts in legacy_fonts.h
568  tprintf("Using list of legacy fonts only\n");
569  const int kNumFontLists = 4;
570  for (int i = 0; i < kNumFontLists; ++i) {
571  for (int j = 0; kFontlists[i][j] != nullptr; ++j) {
572  available_fonts_.push_back(kFontlists[i][j]);
573  }
574  }
575  return available_fonts_;
576  }
577 #endif
578 
579  PangoFontFamily** families = nullptr;
580  int n_families = 0;
581  ListFontFamilies(&families, &n_families);
582  for (int i = 0; i < n_families; ++i) {
583  const char* family_name = pango_font_family_get_name(families[i]);
584  tlog(2, "Listing family %s\n", family_name);
585  if (ShouldIgnoreFontFamilyName(family_name)) {
586  continue;
587  }
588 
589  int n_faces;
590  PangoFontFace** faces = nullptr;
591  pango_font_family_list_faces(families[i], &faces, &n_faces);
592  for (int j = 0; j < n_faces; ++j) {
593  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
594  char* desc_str = pango_font_description_to_string(desc);
595  // "synthesized" font faces that are not truly loadable, so we skip it
596  if (!pango_font_face_is_synthesized(faces[j])
597  && IsAvailableFont(desc_str)) {
598  available_fonts_.push_back(desc_str);
599  }
600  pango_font_description_free(desc);
601  g_free(desc_str);
602  }
603  g_free(faces);
604  }
605  g_free(families);
606  std::sort(available_fonts_.begin(), available_fonts_.end());
607  return available_fonts_;
608 }
609 
610 
611 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
612  std::vector<bool>* unichar_bitmap) {
613  const int kMinUnicodeValue = 33;
614  const int kMaxUnicodeValue = 0x10FFFF;
615  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
616  // Mark off characters that the font can render.
617  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
618  if (IsInterchangeValid(i)) {
619  (*unichar_bitmap)[i]
620  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
621  }
622  }
623 }
624 
625 /* static */
626 void FontUtils::GetAllRenderableCharacters(std::vector<bool>* unichar_bitmap) {
627  const std::vector<std::string>& all_fonts = ListAvailableFonts();
628  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
629 }
630 
631 /* static */
633  std::vector<bool>* unichar_bitmap) {
634  PangoFontInfo font_info(font_name);
635  PangoFont* font = font_info.ToPangoFont();
636  if (font != nullptr) {
637  // Font found.
638  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
639  CharCoverageMapToBitmap(coverage, unichar_bitmap);
640  pango_coverage_unref(coverage);
641  g_object_unref(font);
642  }
643 }
644 
645 /* static */
646 void FontUtils::GetAllRenderableCharacters(const std::vector<std::string>& fonts,
647  std::vector<bool>* unichar_bitmap) {
648  // Form the union of coverage maps from the fonts
649  PangoCoverage* all_coverage = pango_coverage_new();
650  tlog(1, "Processing %u fonts\n", static_cast<unsigned>(fonts.size()));
651  for (unsigned i = 0; i < fonts.size(); ++i) {
652  PangoFontInfo font_info(fonts[i]);
653  PangoFont* font = font_info.ToPangoFont();
654  if (font != nullptr) {
655  // Font found.
656  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
657  // Mark off characters that any font can render.
658  pango_coverage_max(all_coverage, coverage);
659  pango_coverage_unref(coverage);
660  g_object_unref(font);
661  }
662  }
663  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
664  pango_coverage_unref(all_coverage);
665 }
666 
667 
668 // Utilities written to be backward compatible with StringRender
669 
670 /* static */
671 int FontUtils::FontScore(const std::unordered_map<char32, int64_t>& ch_map,
672  const std::string& fontname, int* raw_score,
673  std::vector<bool>* ch_flags) {
674  PangoFontInfo font_info;
675  if (!font_info.ParseFontDescriptionName(fontname)) {
676  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
677  }
678  PangoFont* font = font_info.ToPangoFont();
679  PangoCoverage* coverage = nullptr;
680  if (font != nullptr) coverage = pango_font_get_coverage(font, nullptr);
681  if (ch_flags) {
682  ch_flags->clear();
683  ch_flags->reserve(ch_map.size());
684  }
685  *raw_score = 0;
686  int ok_chars = 0;
687  for (std::unordered_map<char32, int64_t>::const_iterator it = ch_map.begin();
688  it != ch_map.end(); ++it) {
689  bool covered = (coverage != nullptr) && (IsWhitespace(it->first) ||
690  (pango_coverage_get(coverage, it->first)
691  == PANGO_COVERAGE_EXACT));
692  if (covered) {
693  ++(*raw_score);
694  ok_chars += it->second;
695  }
696  if (ch_flags) {
697  ch_flags->push_back(covered);
698  }
699  }
700  pango_coverage_unref(coverage);
701  g_object_unref(font);
702  return ok_chars;
703 }
704 
705 
706 /* static */
708  const std::unordered_map<char32, int64_t>& ch_map,
709  std::vector<std::pair<const char*, std::vector<bool> > >* fonts) {
710  const double kMinOKFraction = 0.99;
711  // Weighted fraction of characters that must be renderable in a font to make
712  // it OK even if the raw count is not good.
713  const double kMinWeightedFraction = 0.99995;
714 
715  fonts->clear();
716  std::vector<std::vector<bool> > font_flags;
717  std::vector<int> font_scores;
718  std::vector<int> raw_scores;
719  int most_ok_chars = 0;
720  int best_raw_score = 0;
721  const std::vector<std::string>& font_names = FontUtils::ListAvailableFonts();
722  for (unsigned i = 0; i < font_names.size(); ++i) {
723  std::vector<bool> ch_flags;
724  int raw_score = 0;
725  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
726  most_ok_chars = std::max(ok_chars, most_ok_chars);
727  best_raw_score = std::max(raw_score, best_raw_score);
728 
729  font_flags.push_back(ch_flags);
730  font_scores.push_back(ok_chars);
731  raw_scores.push_back(raw_score);
732  }
733 
734  // Now select the fonts with a score above a threshold fraction
735  // of both the raw and weighted best scores. To prevent bogus fonts being
736  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
737  // BOTH weighted and raw scores.
738  // In low character-count scripts, the issue is more getting enough fonts,
739  // when only 1 or 2 might have all those rare dingbats etc in them, so we
740  // allow a font with a very high weighted (coverage) score
741  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
742  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
743  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
744  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
745 
746  std::string font_list;
747  for (unsigned i = 0; i < font_names.size(); ++i) {
748  int score = font_scores[i];
749  int raw_score = raw_scores[i];
750  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
751  score >= override_enough) {
752  fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i]));
753  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
754  font_names[i].c_str(),
755  100.0 * score / most_ok_chars,
756  raw_score, 100.0 * raw_score / best_raw_score);
757  font_list += font_names[i];
758  font_list += "\n";
759  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
760  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
761  font_names[i].c_str(),
762  100.0 * score / most_ok_chars,
763  raw_score, 100.0 * raw_score / best_raw_score);
764  }
765  }
766  return font_list;
767 }
768 
769 /* static */
770 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
771  std::string* font_name, std::vector<std::string>* graphemes) {
772  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
773  graphemes);
774 }
775 
776 /* static */
777 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
778  const std::vector<std::string>& all_fonts,
779  std::string* font_name, std::vector<std::string>* graphemes) {
780  if (font_name) font_name->clear();
781  if (graphemes) graphemes->clear();
782  for (unsigned i = 0; i < all_fonts.size(); ++i) {
783  PangoFontInfo font;
784  std::vector<std::string> found_graphemes;
785  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
786  "Could not parse font desc name %s\n",
787  all_fonts[i].c_str());
788  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
789  if (graphemes) graphemes->swap(found_graphemes);
790  if (font_name) *font_name = all_fonts[i];
791  return true;
792  }
793  }
794  return false;
795 }
796 
797 // PangoFontInfo is reinitialized, so clear the static list of fonts.
798 /* static */
799 void FontUtils::ReInit() { available_fonts_.clear(); }
800 
801 // Print info about used font backend
802 /* static */
804  PangoFontMap* font_map = pango_cairo_font_map_get_default();
805  if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap*>(
806  font_map)) == CAIRO_FONT_TYPE_TOY) {
807  printf("Using CAIRO_FONT_TYPE_TOY.\n");
808  } else if (pango_cairo_font_map_get_font_type(
809  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
810  CAIRO_FONT_TYPE_FT) {
811  printf("Using CAIRO_FONT_TYPE_FT.\n");
812  } else if (pango_cairo_font_map_get_font_type(
813  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
814  CAIRO_FONT_TYPE_WIN32) {
815  printf("Using CAIRO_FONT_TYPE_WIN32.\n");
816  } else if (pango_cairo_font_map_get_font_type(
817  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
818  CAIRO_FONT_TYPE_QUARTZ) {
819  printf("Using CAIRO_FONT_TYPE_QUARTZ.\n");
820  } else if (pango_cairo_font_map_get_font_type(
821  reinterpret_cast<PangoCairoFontMap*>(font_map)) ==
822  CAIRO_FONT_TYPE_USER) {
823  printf("Using CAIRO_FONT_TYPE_USER.\n");
824  } else if (!font_map) {
825  printf("Can not create pango cairo font map!\n");
826  }
827 }
828 
829 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::PangoFontInfo::GetSpacingProperties
bool GetSpacingProperties(const std::string &utf8_char, int *x_bearing, int *x_advance) const
Definition: pango_font_info.cpp:314
tesseract::IsWhitespace
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:239
BOOL_PARAM_FLAG
#define BOOL_PARAM_FLAG(name, val, comment)
Definition: commandlineflags.h:33
tesseract::UNICHAR::begin
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
tesseract::UNICHAR::end
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
tesseract::PangoFontInfo::CanRenderString
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
Definition: pango_font_info.cpp:359
tesseract::PangoFontInfo::ParseFontDescriptionName
bool ParseFontDescriptionName(const std::string &name)
Definition: pango_font_info.cpp:191
tesseract::IsInterchangeValid
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:269
TLOG_IS_ON
#define TLOG_IS_ON(level)
Definition: tlog.h:38
tesseract::PangoFontInfo::HardInitFontConfig
static void HardInitFontConfig(const std::string &fonts_dir, const std::string &cache_dir)
Definition: pango_font_info.cpp:121
tesseract::UNICHAR::const_iterator
Definition: unichar.h:109
tesseract::UNICHAR::const_iterator::utf8_len
int utf8_len() const
Definition: unichar.cpp:190
tesseract::PangoFontInfo::SoftInitFontConfig
static void SoftInitFontConfig()
Definition: pango_font_info.cpp:111
util.h
tlog
#define tlog(level,...)
Definition: tlog.h:32
tesseract::kDefaultResolution
const int kDefaultResolution
Definition: pango_font_info.cpp:69
tesseract::UNICHAR
Definition: unichar.h:59
ASSERT_HOST_MSG
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:91
tesseract::File::JoinPath
static std::string JoinPath(const std::string &prefix, const std::string &suffix)
Definition: fileio.cpp:98
fileio.h
tesseract::PangoFontInfo
Definition: pango_font_info.h:39
tesseract::PangoFontInfo::~PangoFontInfo
~PangoFontInfo()
Definition: pango_font_info.cpp:97
tesseract::char32
signed int char32
Definition: unichar.h:53
STRING_PARAM_FLAG
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir")
pango_font_info.h
tesseract::PangoFontInfo::DescriptionName
std::string DescriptionName() const
Definition: pango_font_info.cpp:99
tesseract::File::WriteStringToFileOrDie
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:68
tesseract::PangoFontInfo::DropUncoveredChars
int DropUncoveredChars(std::string *utf8_text) const
Definition: pango_font_info.cpp:265
tesseract::UNICHAR::utf8_str
char * utf8_str() const
Definition: unichar.cpp:129
tesseract
Definition: baseapi.h:65
tesseract::FontUtils::ListAvailableFonts
static const std::vector< std::string > & ListAvailableFonts()
Definition: pango_font_info.cpp:560
tesseract::PangoFontInfo::UNKNOWN
Definition: pango_font_info.h:42
normstrngs.h
tesseract::FontUtils::BestFonts
static std::string BestFonts(const std::unordered_map< char32, int64_t > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
Definition: pango_font_info.cpp:706
tesseract::FontUtils::FontScore
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
Definition: pango_font_info.cpp:670
tesseract::IsUTF8Whitespace
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:245
tesseract::PangoFontInfo::PangoFontInfo
PangoFontInfo()
Definition: pango_font_info.cpp:74
tesseract::FontUtils::PangoFontTypeInfo
static void PangoFontTypeInfo()
Definition: pango_font_info.cpp:802
tesseract::FontUtils::SelectFont
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
Definition: pango_font_info.cpp:769
unichar.h
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
tesseract::FontUtils::ReInit
static void ReInit()
Definition: pango_font_info.cpp:798
tlog.h
tesseract::FontUtils::GetAllRenderableCharacters
static void GetAllRenderableCharacters(std::vector< bool > *unichar_bitmap)
Definition: pango_font_info.cpp:625
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
commandlineflags.h
tesseract::File::DeleteMatchingFiles
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:133
DISABLE_HEAP_LEAK_CHECK
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:60
tesseract::FontUtils::IsAvailableFont
static bool IsAvailableFont(const char *font_desc)
Definition: pango_font_info.h:149
tesseract::PangoFontInfo::CoversUTF8Text
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
Definition: pango_font_info.cpp:216