tesseract  5.0.0-alpha-619-ge9db
pango_font_info_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <cstdio>
13 #include <string>
14 #include <pango/pango.h>
15 #include "include_gunit.h"
16 #include "commandlineflags.h"
17 #include "fileio.h"
18 #include "pango_font_info.h"
19 #include "absl/strings/str_cat.h" // for absl::StrCat
20 #include "gmock/gmock-matchers.h" // for EXPECT_THAT
21 #include "util/utf8/unicodetext.h" // for UnicodeText
22 
23 DECLARE_STRING_PARAM_FLAG(fonts_dir);
24 DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
25 DECLARE_BOOL_PARAM_FLAG(use_only_legacy_fonts);
26 
27 namespace {
28 
29 using tesseract::File;
32 
// Font names expected for the fonts in the testdata directory. The
// DoesListAvailableFonts test compares this table element-by-element against
// FontUtils::ListAvailableFonts(). Both the characters and the pointers are
// const so no test can modify the table.
const char* const kExpectedFontNames[] = {
  "Arab",
  "Arial Bold Italic",
  "DejaVu Sans Ultra-Light",
  "Lohit Hindi",
#if PANGO_VERSION <= 12005
  "Times New Roman",
#else
  "Times New Roman,",  // Pango v1.36.2 requires a trailing ','
#endif
  "UnBatang",
  "Verdana"
};
47 
// Sample text, one string per script, shared by the tests below.
constexpr char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
constexpr char kEngText[] = "the quick brown fox jumps over the lazy dog";
constexpr char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
constexpr char kKorText[] = "이는 것으로";

// Hindi words containing illegal vowel sequences. The list is terminated by a
// nullptr sentinel so callers can iterate without knowing its length.
const char* kBadlyFormedHinWords[] = {
#if PANGO_VERSION <= 12005
  "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
#endif
  // Pango v1.36.2 will render the above words even though they are invalid.
  "प्रंात", nullptr
};
61 
62 class PangoFontInfoTest : public ::testing::Test {
63  protected:
64  void SetUp() override {
65  static std::locale system_locale("");
66  std::locale::global(system_locale);
67  }
68 
69  // Creates a fake fonts.conf file that points to the testdata fonts for
70  // fontconfig to initialize with.
71  static void SetUpTestCase() {
72  FLAGS_fonts_dir = TESTING_DIR;
73  FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
74 #ifdef GOOGLE_TESSERACT
75  FLAGS_use_only_legacy_fonts = false;
76 #endif
77  }
78 
79  PangoFontInfo font_info_;
80 };
81 
82 TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) {
83  PangoFontInfo font("Arial Bold Italic 12");
84  EXPECT_EQ(12, font.font_size());
85  EXPECT_EQ("Arial", font.family_name());
86 }
87 
88 TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) {
89  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12"));
90  EXPECT_EQ(12, font_info_.font_size());
91  EXPECT_EQ("Arial", font_info_.family_name());
92 
93  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10"));
94  EXPECT_EQ(10, font_info_.font_size());
95  EXPECT_EQ("Verdana", font_info_.family_name());
96 
97  EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light"));
98  EXPECT_EQ("DejaVu Sans", font_info_.family_name());
99 }
100 
101 TEST_F(PangoFontInfoTest, DoesParseMissingFonts) {
102  // Font family one of whose faces exists but this one doesn't.
103  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
104  EXPECT_EQ(12, font_info_.font_size());
105  EXPECT_EQ("Arial", font_info_.family_name());
106 
107  // Font family that doesn't exist in testdata. It will still parse the
108  // description name. But without the file, it will not be able to populate
109  // some font family details, like is_monospace().
110  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10"));
111  EXPECT_EQ(10, font_info_.font_size());
112  EXPECT_EQ("Georgia", font_info_.family_name());
113 }
114 
115 TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) {
116  EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
117  int x_bearing, x_advance;
118  EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance));
119  EXPECT_GT(x_advance, 0);
120  EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance));
121  EXPECT_GT(x_advance, 0);
122 }
123 
124 TEST_F(PangoFontInfoTest, CanRenderString) {
125  font_info_.ParseFontDescriptionName("Verdana 12");
126  EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText)));
127 
128  font_info_.ParseFontDescriptionName("UnBatang 12");
129  EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
130 
131  font_info_.ParseFontDescriptionName("Lohit Hindi 12");
132  EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText)));
133 }
134 
135 TEST_F(PangoFontInfoTest, CanRenderLigature) {
136  font_info_.ParseFontDescriptionName("Arab 12");
137  const char kArabicLigature[] = "لا";
138  EXPECT_TRUE(
139  font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
140 
141  printf("Next word\n");
142  EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
143 }
144 
145 TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {
146  font_info_.ParseFontDescriptionName("Verdana 12");
147  EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
148 }
149 
150 TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
151  font_info_.ParseFontDescriptionName("Lohit Hindi 12");
152  for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {
153  EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i],
154  strlen(kBadlyFormedHinWords[i])))
155  << "Can render " << kBadlyFormedHinWords[i];
156  }
157 }
158 
159 TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
160  font_info_.ParseFontDescriptionName("Verdana 12");
161  // Verdana cannot render the "ff" ligature
162  std::string word = "office";
163  EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
164  EXPECT_EQ("oice", word);
165 
166  // Don't drop non-letter characters like word joiners.
167  const char* kJoiners[] = {
168  "\u2060", // U+2060 (WJ)
169  "\u200C", // U+200C (ZWJ)
170  "\u200D" // U+200D (ZWNJ)
171  };
172  for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
173  word = kJoiners[i];
174  EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
175  EXPECT_STREQ(kJoiners[i], word.c_str());
176  }
177 }
178 
179 // ------------------------ FontUtils ------------------------------------
180 
181 class FontUtilsTest : public ::testing::Test {
182  protected:
183  // Creates a fake fonts.conf file that points to the testdata fonts for
184  // fontconfig to initialize with.
185  static void SetUpTestCase() {
186  FLAGS_fonts_dir = TESTING_DIR;
187  FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
188  }
189 
190  void CountUnicodeChars(const char* utf8_text,
191  std::unordered_map<char32, int64_t>* ch_map) {
192  ch_map->clear();
193  UnicodeText ut;
194  ut.PointToUTF8(utf8_text, strlen(utf8_text));
195  for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
196 #if 0
197  if (UnicodeProps::IsWhitespace(*it)) continue;
198 #else
199  if (std::isspace(*it)) continue;
200 #endif
201  ++(*ch_map)[*it];
202  }
203  }
204 };
205 
206 TEST_F(FontUtilsTest, DoesFindAvailableFonts) {
207  EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic"));
208  EXPECT_TRUE(FontUtils::IsAvailableFont("Verdana"));
209  EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light"));
210 
211  // Test that we can support font name convention for Pango v1.30.2 even when
212  // we are running an older version.
213  EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,"));
214 }
215 
216 TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
217  // Only bold italic face is available.
218  EXPECT_FALSE(FontUtils::IsAvailableFont("Arial"));
219  // Don't have a ttf for the Courier family.
220  EXPECT_FALSE(FontUtils::IsAvailableFont("Courier"));
221  // Pango "synthesizes" the italic font from the available Verdana Regular and
222  // includes it in its list, but it is not really loadable.
223  EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic"));
224  // We have "Dejavu Sans Ultra-Light" but not its medium weight counterpart.
225  EXPECT_FALSE(FontUtils::IsAvailableFont("DejaVu Sans"));
226 }
227 
228 TEST_F(FontUtilsTest, DoesListAvailableFonts) {
229  const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
230  EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
231  for (auto& font : fonts) {
232  PangoFontInfo font_info;
233  EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
234  }
235 }
236 
237 TEST_F(FontUtilsTest, DoesFindBestFonts) {
238  std::string fonts_list;
239  std::unordered_map<char32, int64_t> ch_map;
240  CountUnicodeChars(kEngText, &ch_map);
241  EXPECT_EQ(26, ch_map.size()); // 26 letters
242  std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
243  std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
244  EXPECT_TRUE(best_list.size());
245  // All fonts except Lohit Hindi should render English text.
246  EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
247 
248  CountUnicodeChars(kKorText, &ch_map);
249  best_list = FontUtils::BestFonts(ch_map, &font_flags);
250  EXPECT_TRUE(best_list.size());
251  // Only UnBatang font family is able to render korean.
252  EXPECT_EQ(1, font_flags.size());
253  EXPECT_STREQ("UnBatang", font_flags[0].first);
254 }
255 
256 TEST_F(FontUtilsTest, DoesSelectFont) {
257  const char* kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr};
258  const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
259  for (int i = 0; kLangText[i] != nullptr; ++i) {
260  SCOPED_TRACE(kLangNames[i]);
261  std::vector<std::string> graphemes;
262  std::string selected_font;
263  EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
264  &selected_font, &graphemes));
265  EXPECT_TRUE(selected_font.size());
266  EXPECT_TRUE(graphemes.size());
267  }
268 }
269 
270 TEST_F(FontUtilsTest, DoesFailToSelectFont) {
271  const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
272  std::vector<std::string> graphemes;
273  std::string selected_font;
274  EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
275  &selected_font, &graphemes));
276 }
277 
278 TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
279  const int32_t kHindiChar = 0x0905;
280  const int32_t kArabicChar = 0x0623;
281  const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
282  const int32_t kOghamChar = 0x1680; // Ogham space mark
283  std::vector<bool> unicode_mask;
284  FontUtils::GetAllRenderableCharacters(&unicode_mask);
285  EXPECT_TRUE(unicode_mask['A']);
286  EXPECT_TRUE(unicode_mask['1']);
287  EXPECT_TRUE(unicode_mask[kHindiChar]);
288  EXPECT_TRUE(unicode_mask[kArabicChar]);
289  EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
290 #if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
291  EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham.
292 #endif
293  unicode_mask.clear();
294 
295  std::vector<std::string> selected_fonts;
296  selected_fonts.push_back("Lohit Hindi");
297  FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
298  EXPECT_TRUE(unicode_mask['1']);
299  EXPECT_TRUE(unicode_mask[kHindiChar]);
300  EXPECT_FALSE(unicode_mask['A']); // Lohit doesn't render English,
301  EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic,
302  EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
303  EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham.
304  unicode_mask.clear();
305 
306  // Check that none of the included fonts cover the Mongolian or Ogham space
307  // characters.
308  for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
309  SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
310  FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
311 #if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
312  EXPECT_FALSE(unicode_mask[kOghamChar]);
313 #endif
314  EXPECT_FALSE(unicode_mask[kMongolianChar]);
315  unicode_mask.clear();
316  }
317 }
318 } // namespace
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::IsWhitespace
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:239
ARRAYSIZE
#define ARRAYSIZE(arr)
Definition: include_gunit.h:53
DECLARE_BOOL_PARAM_FLAG
DECLARE_BOOL_PARAM_FLAG(use_only_legacy_fonts)
include_gunit.h
tesseract::TEST_F
TEST_F(EquationFinderTest, IdentifySpecialText)
Definition: equationdetect_test.cc:181
fileio.h
tesseract::PangoFontInfo
Definition: pango_font_info.h:39
unicodetext.h
FLAGS_test_tmpdir
const char * FLAGS_test_tmpdir
Definition: include_gunit.h:20
pango_font_info.h
UnicodeText
Definition: unicodetext.h:116
DECLARE_STRING_PARAM_FLAG
DECLARE_STRING_PARAM_FLAG(fonts_dir)
UnicodeText::PointToUTF8
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:256
UnicodeText::end
const_iterator end() const
Definition: unicodetext.cc:412
tesseract::FontUtils
Definition: pango_font_info.h:145
commandlineflags.h
tesseract::File
Definition: fileio.h:55
UnicodeText::const_iterator
Definition: unicodetext.h:176
UnicodeText::begin
const_iterator begin() const
Definition: unicodetext.cc:408