tesseract  5.0.0-alpha-619-ge9db
normstrngs_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "absl/strings/str_format.h" // for absl::StrFormat
13 #include "include_gunit.h"
14 #include "normstrngs.h"
15 #include "normstrngs_test.h"
16 #include <tesseract/strngs.h>
17 #include <tesseract/unichar.h>
18 #ifdef INCLUDE_TENSORFLOW
19 #include "util/utf8/unilib.h" // for UniLib
20 #endif
21 
22 #include "include_gunit.h"
23 
24 namespace tesseract {
25 namespace {
26 
// EncodeAsUTF8 is compiled only when MISSING_CODE is defined; every call to
// it in this file sits behind that same macro.
27 #if defined(MISSING_CODE)
// Returns the UTF-8 encoding of the code point ch32 as a std::string, using
// the bytes and byte length reported by a UNICHAR constructed from ch32.
28 static std::string EncodeAsUTF8(const char32 ch32) {
29  UNICHAR uni_ch(ch32);
// The (pointer, length) constructor copies exactly utf8_len() bytes.
30  return std::string(uni_ch.utf8(), uni_ch.utf8_len());
31 }
32 #endif
33 
// Expects the text written to `result` to be identical to the plain ASCII
// input kBasicText.
34 TEST(NormstrngsTest, BasicText) {
35  const char* kBasicText = "AbCd Ef";
36  std::string result;
// NOTE(review): listing line 37 (the opening of this assertion) is missing
// from this extract. The surviving arguments match the tail of
// NormalizeUTF8String(u_mode, ocr_normalize, grapheme_normalize, str8,
// normalized); restore the first line from the original file.
38  GraphemeNorm::kNormalize, kBasicText,
39  &result));
40  EXPECT_STREQ(kBasicText, result.c_str());
41 }
42 
// Expects ligature code points to come out as their separate ASCII letters:
// U+0133 as "ij" and U+FB01 (inside a word) as "fi".
// NOTE(review): in this listing both input literals appear to be plain ASCII
// already, while the trailing comments name U+0133 and U+FB01. Confirm
// against the original file that the literals hold the ligature code points,
// otherwise the checks are vacuous.
43 TEST(NormstrngsTest, LigatureText) {
44  const char* kTwoByteLigText = "ij"; // U+0133 (ij) -> ij
45  std::string result;
// NOTE(review): listing line 46 (the opening of this assertion) is missing
// from this extract; the arguments below match the tail of
// NormalizeUTF8String(..., grapheme_normalize, str8, normalized).
47  GraphemeNorm::kNormalize, kTwoByteLigText,
48  &result));
49  EXPECT_STREQ("ij", result.c_str());
50 
51  const char* kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi
// NOTE(review): listing line 52 (the opening of this assertion) is missing
// from this extract as well. `result` is reused for the second input.
53  GraphemeNorm::kNormalize, kThreeByteLigText,
54  &result));
55  EXPECT_STREQ("finds", result.c_str());
56 }
57 
// Checks the OCR-specific substitutions for curly quotes and the em dash.
// The first three assertions expect the ASCII replacement; the last three
// expect each input to come back unchanged.
// NOTE(review): the opening line of all six assertions (listing lines 61, 67,
// 73, 77, 81 and 85) is missing from this extract. Those lines presumably
// carry the UnicodeNormMode and OCRNorm arguments that distinguish the two
// halves of the test (OCRNorm::kNormalize vs OCRNorm::kNone); restore them
// from the original file.
58 TEST(NormstrngsTest, OcrSpecificNormalization) {
59  const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+0027 (')
60  std::string result;
62  GraphemeNorm::kNormalize, kSingleQuoteText,
63  &result));
64  EXPECT_STREQ("'Hi", result.c_str());
65 
66  const char* kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+0022 (")
68  GraphemeNorm::kNormalize, kDoubleQuoteText,
69  &result));
70  EXPECT_STREQ("\"Hi", result.c_str());
71 
72  const char* kEmDash = "Hi—"; // U+2014 (—) -> U+002D (-)
74  GraphemeNorm::kNormalize, kEmDash, &result));
75  EXPECT_STREQ("Hi-", result.c_str());
76  // Without the ocr normalization, these changes are not made.
78  GraphemeNorm::kNormalize, kSingleQuoteText,
79  &result));
80  EXPECT_STREQ(kSingleQuoteText, result.c_str());
82  GraphemeNorm::kNormalize, kDoubleQuoteText,
83  &result));
84  EXPECT_STREQ(kDoubleQuoteText, result.c_str());
86  GraphemeNorm::kNormalize, kEmDash, &result));
87  EXPECT_STREQ(kEmDash, result.c_str());
88 }
89 
90 // Sample text used in tests.
// English, Hindi and Korean samples, used by DetectsCorrectText below.
91 const char kEngText[] = "the quick brown fox jumps over the lazy dog";
92 const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
93 const char kKorText[] = "이는 것으로";
94 // Hindi words containing illegal vowel sequences.
// Iterated with ARRAYSIZE by DetectsIncorrectText.
95 const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात",
96  "कहीअे", "पत्रिाका", "छह्णाीस"};
97 // Thai illegal sequences.
// Iterated with ARRAYSIZE by DetectsIncorrectText.
98 const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};
99 
// Expects the English, Hindi and Korean sample strings to be written to
// `chars` unchanged; `chars` is reused for all three inputs.
// NOTE(review): the opening line of each assertion (listing lines 102, 106
// and 111) is missing from this extract. The surviving arguments match the
// tail of NormalizeUTF8String(..., grapheme_normalize, str8, normalized);
// restore the first lines from the original file.
100 TEST(NormstrngsTest, DetectsCorrectText) {
101  std::string chars;
103  GraphemeNorm::kNormalize, kEngText, &chars));
104  EXPECT_STREQ(kEngText, chars.c_str());
105 
107  GraphemeNorm::kNormalize, kHinText, &chars))
// Streamed into the assertion so a failure message shows the offending text.
108  << "Incorrect text: '" << kHinText << "'";
109  EXPECT_STREQ(kHinText, chars.c_str());
110 
112  GraphemeNorm::kNormalize, kKorText, &chars));
113  EXPECT_STREQ(kKorText, chars.c_str());
114 }
115 
// Runs one assertion per entry of kBadlyFormedHinWords and
// kBadlyFormedThaiWords, passing nullptr as the output string and streaming
// the word into the assertion message.
// NOTE(review): listing lines 118-119 and 124-125 (the opening of each
// assertion) are missing from this extract. Given the test name they
// presumably expect the call to fail (EXPECT_FALSE); confirm against the
// original file.
116 TEST(NormstrngsTest, DetectsIncorrectText) {
117  for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
120  kBadlyFormedHinWords[i], nullptr))
121  << kBadlyFormedHinWords[i];
122  }
123  for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
126  kBadlyFormedThaiWords[i], nullptr))
127  << kBadlyFormedThaiWords[i];
128  }
129 }
130 
// Expects a Latin-script sentence to be written to `dest` unchanged.
131 TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
132  std::string nonindic = "Here's some latin text.";
// NOTE(review): listing lines 133-134 are missing from this extract. They
// must declare `dest` (used below and compared with a std::string) and open
// the assertion whose trailing arguments follow; restore them from the
// original file.
135  GraphemeNorm::kNormalize, nonindic.c_str(),
136  &dest))
// On failure the message shows the input via PrintString32WithUnicodes.
137  << PrintString32WithUnicodes(nonindic);
138  EXPECT_EQ(dest, nonindic);
139 }
140 
// Segments "x" + U+200D + three Malayalam code points and expects three
// glyphs: "x", U+0D06, and U+0D34 U+0D02. U+200D (the joiner) appears in none
// of them.
141 TEST(NormstrngsTest, NoLonelyJoiners) {
142  std::string str = "x\u200d\u0d06\u0d34\u0d02";
143  std::vector<std::string> glyphs;
144  // Returns true, but the joiner is gone.
145  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
// NOTE(review): listing line 146 (the leading arguments: u_mode,
// ocr_normalize, g_mode, report_errors) and listing line 148 (the end of the
// statement) are missing from this extract; restore them from the original
// file.
147  str.c_str(), &glyphs))
149  EXPECT_EQ(glyphs.size(), 3);
150  EXPECT_EQ(glyphs[0], std::string("x"));
151  EXPECT_EQ(glyphs[1], std::string("\u0d06"));
152  EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02"));
153 }
154 
// Segments U+0D2A U+200D "+" U+0D2A U+0D4B and expects three glyphs:
// U+0D2A, "+", and U+0D2A U+0D4B. U+200D (the joiner) appears in none of
// them.
155 TEST(NormstrngsTest, NoLonelyJoinersPlus) {
156  std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
157  std::vector<std::string> glyphs;
158  // Returns true, but the joiner is gone.
159  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
// NOTE(review): listing line 160 (the leading arguments: u_mode,
// ocr_normalize, g_mode, report_errors) and listing line 162 (the end of the
// statement) are missing from this extract; restore them from the original
// file.
161  str.c_str(), &glyphs))
163  EXPECT_EQ(glyphs.size(), 3);
164  EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
165  EXPECT_EQ(glyphs[1], std::string("+"));
166  EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b"));
167 }
168 
// Two cases built from U+200D / U+200C: first with a "+" among the joiners,
// then with the joiners alone.
169 TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
170  std::string str = "\u200d+\u200c\u200d";
171  // Returns true, but the joiners are gone.
// NOTE(review): listing line 172 (the check for the first string) is missing
// from this extract; presumably a call to ExpectGraphemeModeResults. Confirm
// against the original file.
173  str = "\u200d\u200c\u200d";
174  // Without the plus, the string is invalid.
175  std::string result;
// NOTE(review): listing line 176 (the opening of this assertion) is missing
// from this extract. Given the comment above it presumably expects the call
// to fail; restore it from the original file.
177  GraphemeNorm::kNormalize, str.c_str(),
178  &result))
// On failure the message shows whatever was written to `result`.
179  << PrintString32WithUnicodes(result);
180 }
181 
// Input is three U+0628 letters separated by U+200C and U+200D.
182 TEST(NormstrngsTest, JoinersStayInArabic) {
183  std::string str = "\u0628\u200c\u0628\u200d\u0628";
184  // Returns true, string untouched.
// NOTE(review): listing line 185 (the only check in this test) is missing
// from this extract; presumably a call to ExpectGraphemeModeResults with
// `str` as both input and target. Restore it from the original file.
186 }
187 
// Input is the single code point U+0CEA.
188 TEST(NormstrngsTest, DigitOK) {
189  std::string str = "\u0cea"; // Digit 4.
// NOTE(review): listing line 190 (the only check in this test) is missing
// from this extract; presumably a call to ExpectGraphemeModeResults. Restore
// it from the original file.
191 }
192 
// Two inputs in turn: U+0964, then U+0965; `str` is reassigned between them.
// NOTE(review): listing lines 195 and 197 (the check after each assignment)
// are missing from this extract; presumably calls to
// ExpectGraphemeModeResults. Restore them from the original file.
193 TEST(NormstrngsTest, DandaOK) {
194  std::string str = "\u0964"; // Single danda.
196  str = "\u0965"; // Double danda.
198 }
199 
// Regression test over sample text in many scripts. kScriptText pairs a
// script name (first) with sample text (second); the adjacent string literals
// of each entry are concatenated by the compiler into one string. The loop at
// the end runs one assertion per entry and reports the script name and text
// on failure.
200 TEST(NormstrngsTest, AllScriptsRegtest) {
201  // Tests some valid text in a large number of scripts, some of which were
202  // found to be rejected by an earlier version.
203  const std::vector<std::pair<std::string, std::string>> kScriptText(
204  {{"Arabic",
205  " فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
206  "توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
207  "مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند "
208  "سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای"},
209  {"Armenian",
210  "անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-"
211  "պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-"
212  "Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-"
213  "գծերը եւ միջագծերը կը համրուին վարէն վեր:"},
214  {"Bengali",
215  "এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত "
216  "পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে "
217  "সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে "
218  "কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী"},
219  {"Cyrillic",
220  "достей, є ще нагороди й почесті, є хай і сумнівна, але слава, "
221  "вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., "
222  "»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- "
223  "І знов майдан зачорнів од народу. Всередині чоло-"},
224  {"Devanagari",
225  "डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी "
226  "बाबतीत लिहिणे ही एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा "
227  "प्रबंध, आधोगिक प्रबंध तथा बैंकिंग एवम वाणिज्य आदि विषयों में "
228  "चित्रकृती दिल्या. शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत"},
229  {"Greek",
230  "Μέσα ένα τετράδιο είχα στριμώξει το πρώτο "
231  "νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα "
232  "οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. "
233  "είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-"},
234  {"Gujarati",
235  "ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું "
236  "શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને "
237  "ત્યાં વાંકુથી પાછે આવ્યો, ચોરીનો માલ સોંપવા ! "
238  "કહી. એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી"},
239  {"Gurmukhi",
240  "ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ "
241  "ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ "
242  "ਭੂਰਾ ਸਾਨੂੰ ਥੜਾ ਚੰਗਾ ਲਗਦਾ ਸੀ । ਉਸ ਦਾ ਇਕ ਪੈਰ ਜਨਮ ਤੋ "
243  "ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,"},
244  {"Hangul",
245  "로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 "
246  "그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 "
247  "의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 "
248  "마르크스 레"
249  "각하는 그는 그들의 식사보장을 위해 때때로 집에"},
250  {"HanS",
251  "大凡世界上的先生可 分 三 种: 第一种只会教书, 只会拿一 "
252  "书像是探宝一样,在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红"
253  " "
254  "持 “左” 倾冒险主义的干部,便扣上 “富农 "
255  "笑说:“我听说了,王总工程师也跟我说过了,只是工作忙,谁"},
256  {"HanT",
257  "叁、 銀行資產管理的群組分析模式 "
258  "民國六十三年,申請就讀台灣大學歷史研究所,並從事著述,"
259  "質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 "
260  "董橋,一九四二年生,福建晉江人,國立成功大學外"},
261  {"Hebrew",
262  " אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי"
263  " הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את"
264  " ווערטער געהאט, אבער דער עיקר איז ניט דאָס וואָרט, נאָר"
265  " על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד"},
266  {"Japanese",
267  "は異民族とみなされていた。楚の荘王(前613〜前 "
268  "を詳細に吟味する。実際の治療活動の領域は便宜上、(1) 障害者 "
269  "困難性は多角企業の場合原則として部門別に判断されている.). "
270  "☆ご希望の団体には見本をお送りします"},
271  {"Kannada",
272  "ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು "
273  "ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ "
274  "ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು "
275  "\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,"},
276  {"Khmer",
277  "សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ "
278  "និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី "
279  "កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន "
280  "ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ"},
281  {"Lao",
282  "ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ "
283  "ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; "
284  "ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. "
285  "ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)"},
286  {"Latin",
287  "režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt "
288  "Ešte nedávno sa chcel mladý Novomeský „liečiť” "
289  "tiivisia kysymyksiä, mistä seuraa, että spekula- | don luonteesta "
290  "Grabiel Sanchez, yang bertani selama 120 tahun meninggal"},
291  {"Malayalam",
292  "അമൂർത്തചിത്രമായിരിക്കും. ഛേ! ആ വീട്ടിലേക്ക് അവളൊന്നിച്ച് പോകേണ്ടതാ "
293  "മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു "
294  "വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും? പറ. "
295  "എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട"},
296  {"Tamil",
297  "பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி "
298  "உள்ளடக்கி நிற்பது விநோத வார்த்தையின் அஃறிணை "
299  "சூரிய கிரஹண சமயத்தில் குருக்ஷேத்திரம் செல்வது "
300  "காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',"},
301  {"Telugu",
302  "1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు "
303  "ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. "
304  "సంచారము చేయును. మీరు ఇప్పుడే కాళకాలయమునకు "
305  "ఎంతటి సరళమైన భాషలో వ్రాశాడో విశదమవుతుంది. పైగా ఆనాటి భాష"},
306  {"Thai",
307  "อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า "
308  "ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว ตราบนั้น "
309  "พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน "
310  "อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง"},
311  {"Vietnamese",
312  "vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng "
313  "chiếc xe con gấu chạy qua nhà. Nhưng thỉnh thoảng "
314  "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng "
315  "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}});
316 
// One assertion per script; a fresh `normalized` string receives each output.
317  for (const auto& p : kScriptText) {
318  std::string normalized;
319  EXPECT_TRUE(tesseract::NormalizeUTF8String(
// NOTE(review): listing line 320 (the leading u_mode and ocr_normalize
// arguments) is missing from this extract; restore it from the original file.
321  tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))
322  << "Script=" << p.first << " text=" << p.second;
323  }
324 }
325 
// Checks IsWhitespace on individual code points: ASCII space, tab, CR and LF,
// every code point from U+2000 to U+200A, and U+3000 must be whitespace;
// U+FEFF must not be.
326 TEST(NormstrngsTest, IsWhitespace) {
327  // U+0020 is whitespace
328  EXPECT_TRUE(IsWhitespace(' '));
329  EXPECT_TRUE(IsWhitespace('\t'));
330  EXPECT_TRUE(IsWhitespace('\r'));
331  EXPECT_TRUE(IsWhitespace('\n'));
332  // U+2000 through U+200A
333  for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {
// SCOPED_TRACE tags any failure in this iteration with the code point in hex.
334  SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
335  EXPECT_TRUE(IsWhitespace(ch));
336  }
337  // U+3000 is whitespace
338  EXPECT_TRUE(IsWhitespace(0x3000));
339  // ZWNBSP is not considered a space.
340  EXPECT_FALSE(IsWhitespace(0xFEFF));
341 }
342 
// Checks the length of the leading whitespace run reported by
// SpanUTF8Whitespace: 4 for " \t\r\n" with or without trailing text, 0 when
// the string starts with a non-whitespace character or is empty.
343 TEST(NormstrngsTest, SpanUTF8Whitespace) {
344  EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\n"));
345  EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\nabc"));
// Whitespace later in the string does not count; only the leading run does.
346  EXPECT_EQ(0, SpanUTF8Whitespace("abc \t\r\nabc"));
347  EXPECT_EQ(0, SpanUTF8Whitespace(""));
348 }
349 
// Checks the length of the leading non-whitespace run reported by
// SpanUTF8NotWhitespace. The expected values for the non-ASCII inputs are
// UTF-8 byte counts, not character counts.
350 TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
// These locals shadow the file-level kHinText / kKorText inside this test.
351  const char kHinText[] = "पिताने विवाह";
352  const char kKorText[] = "이는 것으로 다시 넣을";
353  const char kMixedText[] = "والفكر 123 والصراع abc";
354 
// Empty input and inputs that start with whitespace give 0.
355  EXPECT_EQ(0, SpanUTF8NotWhitespace(""));
356  EXPECT_EQ(0, SpanUTF8NotWhitespace(" abc"));
357  EXPECT_EQ(0, SpanUTF8NotWhitespace("\rabc"));
358  EXPECT_EQ(0, SpanUTF8NotWhitespace("\tabc"));
359  EXPECT_EQ(0, SpanUTF8NotWhitespace("\nabc"));
360  EXPECT_EQ(3, SpanUTF8NotWhitespace("abc def"));
// First word: 6 Devanagari code points x 3 bytes = 18.
361  EXPECT_EQ(18, SpanUTF8NotWhitespace(kHinText));
// First word: 2 Hangul syllables x 3 bytes = 6.
362  EXPECT_EQ(6, SpanUTF8NotWhitespace(kKorText));
// First word: 6 Arabic letters x 2 bytes = 12.
363  EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
364 }
365 
366 // Test that the method clones the util/utf8/public/unilib definition of
367 // interchange validity.
// With INCLUDE_TENSORFLOW defined, the test loops over every value from 33 to
// 0x10FFFF, tagging each iteration with the code point; otherwise it is
// skipped via GTEST_SKIP().
368 TEST(NormstrngsTest, IsInterchangeValid) {
369 #ifdef INCLUDE_TENSORFLOW
370  const int32_t kMinUnicodeValue = 33;
371  const int32_t kMaxUnicodeValue = 0x10FFFF;
372  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
373  SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
// NOTE(review): listing line 374 (the assertion in the loop body) is missing
// from this extract; presumably it compares UniLib::IsInterchangeValid(ch)
// with IsInterchangeValid(ch). Restore it from the original file.
375  }
376 #else
377  GTEST_SKIP();
378 #endif
379 }
380 
381 // Test that the method clones the util/utf8/public/unilib definition of
382 // 7-bit ASCII interchange validity.
// The loop is compiled only when both MISSING_CODE and INCLUDE_TENSORFLOW are
// defined; otherwise the test is skipped via GTEST_SKIP().
383 TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
384 #if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
385  const int32_t kMinUnicodeValue = 33;
386  const int32_t kMaxUnicodeValue = 0x10FFFF;
387  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
388  SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
389  std::string str = EncodeAsUTF8(ch);
// NOTE(review): listing lines 390-391 (the assertion that uses `str`) are
// missing from this extract; restore them from the original file.
392  }
393 #else
394  // Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
395  GTEST_SKIP();
396 #endif
397 }
398 
399 // Test that the method clones the util/utf8/public/unilib definition of
400 // fullwidth-to-halfwidth conversion.
// Three fixed mappings are always checked. The exhaustive comparison against
// UniLib is compiled only when both MISSING_CODE and INCLUDE_TENSORFLOW are
// defined.
401 TEST(NormstrngsTest, FullwidthToHalfwidth) {
402  // U+FF21 -> U+0041 (Latin capital letter A)
403  EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21));
404  // U+FF05 -> U+0025 (percent sign)
405  EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05));
406  // U+FFE6 -> U+20A9 (won sign)
407  EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
408 
409 #if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
410  // Skipped because of missing UniLib::FullwidthToHalfwidth.
411  const int32_t kMinUnicodeValue = 33;
412  const int32_t kMaxUnicodeValue = 0x10FFFF;
413  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
// Values rejected by IsValidCodepoint are not compared.
414  if (!IsValidCodepoint(ch)) continue;
415  SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
// UniLib works on UTF-8 strings, so both sides are compared as UTF-8.
416  std::string str = EncodeAsUTF8(ch);
417  const std::string expected_half_str =
418  UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);
419  EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));
420  }
421 #endif
422 }
423 
424 } // namespace
425 } // namespace tesseract
tesseract::OCRNorm::kNone
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::NormalizeUTF8String
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:163
strngs.h
tesseract::IsWhitespace
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:239
tesseract::SpanUTF8Whitespace
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:249
normstrngs_test.h
tesseract::IsInterchangeValid
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:269
tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:188
tesseract::IsValidCodepoint
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:234
tesseract::IsInterchangeValid7BitAscii
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:292
ARRAYSIZE
#define ARRAYSIZE(arr)
Definition: include_gunit.h:53
include_gunit.h
tesseract::FullwidthToHalfwidth
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:298
tesseract::PrintString32WithUnicodes
std::string PrintString32WithUnicodes(const std::string &str)
Definition: normstrngs_test.h:34
tesseract::UnicodeNormMode::kNFKC
tesseract
Definition: baseapi.h:65
tesseract::ExpectGraphemeModeResults
void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, const std::string &target_str)
Definition: normstrngs_test.h:48
tesseract::SpanUTF8NotWhitespace
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:259
normstrngs.h
unichar.h
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
tesseract::UnicodeNormMode::kNFC
char32
signed int char32
Definition: pango_font_info.h:33
tesseract::GraphemeNormMode::kCombined
unilib.h
tesseract::OCRNorm::kNormalize