tesseract  5.0.0-alpha-619-ge9db
validate_grapheme_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "normstrngs.h"
14 #include "normstrngs_test.h"
15 
16 namespace tesseract {
17 namespace {
18 
19 TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
20  std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
21  std::vector<std::string> glyphs;
22  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
24  str.c_str(), &glyphs))
26  // It made 3 graphemes.
27  EXPECT_EQ(glyphs.size(), 3);
28  EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
29  EXPECT_EQ(glyphs[1], std::string("\u0c15"));
30  EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
31 }
32 
33 TEST(ValidateGraphemeTest, SingleConsonantOK) {
34  std::string str = "\u0cb9"; // HA
35  std::vector<std::string> glyphs;
36  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
38  str.c_str(), &glyphs))
40  EXPECT_EQ(glyphs.size(), 1);
41  EXPECT_EQ(glyphs[0], str);
42 }
43 
44 TEST(ValidateGraphemeTest, SimpleCV) {
45  std::string str = "\u0cb9\u0cbf"; // HA I
46  std::vector<std::string> glyphs;
47  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
49  str.c_str(), &glyphs))
51  EXPECT_EQ(glyphs.size(), 1);
52  EXPECT_EQ(glyphs[0], str);
53 }
54 
55 TEST(ValidateGraphemeTest, SubscriptConjunct) {
56  std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
57  std::vector<std::string> glyphs;
58  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
60  str.c_str(), &glyphs))
62  EXPECT_EQ(glyphs.size(), 1);
63  EXPECT_EQ(glyphs[0], str);
64  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
66  true, str.c_str(), &glyphs))
68  EXPECT_EQ(glyphs.size(), 3);
69  EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
70 }
71 
72 TEST(ValidateGraphemeTest, HalfFormJoiner) {
73  std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
74  std::vector<std::string> glyphs;
75  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
77  str.c_str(), &glyphs))
79  EXPECT_EQ(glyphs.size(), 1);
80  EXPECT_EQ(glyphs[0], str);
81  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
83  true, str.c_str(), &glyphs))
85  EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
86  EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
87 }
88 
89 TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
90  std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
91  std::vector<std::string> glyphs;
92  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
94  str.c_str(), &glyphs))
96  EXPECT_EQ(glyphs.size(), 1);
97  EXPECT_EQ(glyphs[0], str);
98  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
100  true, str.c_str(), &glyphs))
102  EXPECT_EQ(glyphs.size(), 3);
103  EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
104 }
105 
106 TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
107  std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
108  std::vector<std::string> glyphs;
109  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
111  str.c_str(), &glyphs))
113  EXPECT_EQ(glyphs.size(), 1);
114  EXPECT_EQ(glyphs[0], str);
115  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
117  true, str.c_str(), &glyphs))
119  EXPECT_EQ(glyphs.size(), 3);
120  EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
121  // Malaylam only, so not allowed in Telugu.
122  str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
123  EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
125  str.c_str(), &glyphs))
127 }
128 
129 TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
130  std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
131  std::vector<std::string> glyphs;
132  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
134  str.c_str(), &glyphs))
136  EXPECT_EQ(glyphs.size(), 2);
137  EXPECT_EQ(glyphs[1], std::string("\u0d24"));
138  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
140  true, str.c_str(), &glyphs))
142  EXPECT_EQ(glyphs.size(), 3);
143  EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
144 }
145 
146 TEST(ValidateGraphemeTest, ThaiGraphemes) {
147  // This is a single grapheme unless in glyph split mode
148  std::string str = "\u0e14\u0e38\u0e4a";
149  std::vector<std::string> glyphs;
150  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
152  str.c_str(), &glyphs))
154  EXPECT_EQ(glyphs.size(), 1);
155  EXPECT_EQ(glyphs[0], str);
156  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
158  true, str.c_str(), &glyphs))
160  EXPECT_EQ(glyphs.size(), 3);
161  EXPECT_EQ(glyphs[0], std::string("\u0e14"));
162 }
163 
164 TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
165  std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
166  std::vector<std::string> glyphs;
167  // Returns true, but the joiner is gone.
168  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
170  str.c_str(), &glyphs))
172  EXPECT_EQ(glyphs.size(), 5);
173  EXPECT_EQ(glyphs[0], std::string("'"));
174  EXPECT_EQ(glyphs[1], std::string("\u0d24"));
175  EXPECT_EQ(glyphs[2], std::string("\u0d23"));
176  EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
177  EXPECT_EQ(glyphs[4], std::string("'"));
178 }
179 
180 } // namespace
181 } // namespace tesseract
tesseract::OCRNorm::kNone
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::PrintStringVectorWithUnicodes
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
Definition: normstrngs_test.h:39
normstrngs_test.h
tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:188
include_gunit.h
tesseract::GraphemeNormMode::kGlyphSplit
tesseract::PrintString32WithUnicodes
std::string PrintString32WithUnicodes(const std::string &str)
Definition: normstrngs_test.h:34
tesseract
Definition: baseapi.h:65
normstrngs.h
tesseract::UnicodeNormMode::kNFC
tesseract::GraphemeNormMode::kCombined