tesseract  5.0.0-alpha-619-ge9db
validate_indic_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "normstrngs.h"
14 #include "normstrngs_test.h"
15 
16 namespace tesseract {
17 namespace {
18 
19 // Though the unicode example for Telugu in section 12.7:
20 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
21 // shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
22 // suppress a conjunct that would otherwise occur. If a consonant is followed
23 // by a virama and then by a non-Indic character, OpenType will presume that
24 // the user simply meant to suppress the inherent vowel of the consonant
25 // and render it as the consonant with an explicit virama, the same as if
26 // a ZWNJ had followed. Since this is confusing to an OCR engine, the
27 // normalizer always puts a terminating ZWNJ on the end if not present,
28 // and accepts the string as valid.
29 TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
30  std::string str = "\u0c15\u0c4d"; // KA - virama
31  std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ
32  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);
33  // Same result if we started with the normalized string.
35  target_str);
36 }
37 
38 // Only one dependent vowel is allowed.
39 TEST(ValidateIndicTest, OnlyOneDependentVowel) {
40  std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU
43  GraphemeNorm::kNormalize, str.c_str(),
44  &dest))
46 }
47 
48 // [c26][c4d][c01]
49 // A consonant (DA) followed by the virama followed by a bindu
50 // Syllable modifiers [c01][c02][c03] all modify the pronunciation of
51 // the vowel in a syllable, as does the virama [c4d]. You can only
52 // have one of these on a syllable.
53 //
54 // References:
55 // http://www.omniglot.com/writing/telugu.htm
56 TEST(ValidateIndicTest, OnlyOneVowelModifier) {
57  std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu
58  std::string result;
60  GraphemeNorm::kNormalize, str.c_str(),
61  &result));
62  // It made 1 grapheme of 4 chars, by terminating the explicit virama.
63  EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result);
64 
65  str = "\u0995\u0983\u0981"; // KA visarga candrabindu
67  GraphemeNorm::kNormalize, str.c_str(),
68  &result));
69 
70  // Exception: Malayalam allows multiple anusvara.
71  str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara
73  GraphemeNorm::kNormalize, str.c_str(),
74  &result));
75  EXPECT_EQ(str, result);
76 }
77 
78 // [c28][c02][c3f]
79 // A consonant (NA) followed by the Anusvara/sunna and another matra (I).
80 // The anusvara [c02] is a pronunciation directive
81 // for a whole syllable and only appears at the end of the syllable
82 // References:
83 // + Unicode v9, 12.1 "Modifier Mark Rules R10,"
84 // and the Microsoft page
85 // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
86 TEST(ValidateIndicTest, VowelModifierMustBeLast) {
87  std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I
90  GraphemeNorm::kNormalize, str.c_str(),
91  &dest))
93  // Swap c02/c3f and all is ok.
94  str = "\u0c28\u0c3f\u0c02"; // NA I Sunna
96  GraphemeNorm::kNormalize, str.c_str(), &dest))
98  EXPECT_EQ(dest, str);
99 }
100 
101 // [c05][c47]
102 // A Vowel (A) followed by a combining vowel/matra (EE).
103 // In Telugu, matras are only put on consonants, not independent
104 // vowels.
105 // References:
106 // + Unicode v9, 12.1:
107 // Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
108 // + http://varamozhi.sourceforge.net/iscii91.pdf
109 TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
110  std::string str = "\u0c05\u0c47"; // A EE
113  GraphemeNorm::kNormalize, str.c_str(),
114  &dest))
116  str = "\u0c1e\u0c3e"; // NYA AA
118  GraphemeNorm::kNormalize, str.c_str(), &dest))
120  EXPECT_EQ(dest, str);
121 }
122 
123 // Sub-graphemes are allowed if GraphemeNorm is turned off.
124 TEST(ValidateIndicTest, SubGraphemes) {
125  std::string str = "\u0d3e"; // AA
128  GraphemeNorm::kNormalize, str.c_str(),
129  &dest))
132  GraphemeNorm::kNone, str.c_str(), &dest))
134  EXPECT_EQ(dest, str);
135 }
136 
137 TEST(ValidateIndicTest, Nukta) {
138  std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA
139  std::vector<std::string> glyphs;
140  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
142  true, str.c_str(), &glyphs));
143  EXPECT_EQ(glyphs.size(), 3);
144  EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9"));
145  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
146  std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA
148 }
149 
150 // Sinhala has some of its own specific rules. See www.macciato.com/sinhala
151 TEST(ValidateIndicTest, SinhalaRakaransaya) {
152  std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna
155  GraphemeNorm::kNormalize, str.c_str(), &dest))
157  EXPECT_EQ(dest, str);
158  std::vector<std::string> glyphs;
159  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
161  true, str.c_str(), &glyphs));
162  EXPECT_EQ(glyphs.size(), 2);
163  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb"));
164  // Can be followed by a dependent vowel.
165  str += "\u0dd9"; // E
167  GraphemeNorm::kNormalize, str.c_str(), &dest))
169  EXPECT_EQ(dest, str);
170 }
171 
172 TEST(ValidateIndicTest, SinhalaYansaya) {
173  std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna
176  GraphemeNorm::kNormalize, str.c_str(), &dest))
178  EXPECT_EQ(dest, str);
179  // Can be followed by a dependent vowel.
180  str += "\u0ddd"; // OO
182  GraphemeNorm::kNormalize, str.c_str(), &dest))
184  EXPECT_EQ(dest, str);
185  std::vector<std::string> glyphs;
186  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
188  true, str.c_str(), &glyphs));
189  EXPECT_EQ(glyphs.size(), 3);
190  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba"));
191 }
192 
193 TEST(ValidateIndicTest, SinhalaRepaya) {
194  std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA
195  std::vector<std::string> glyphs;
196  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
198  str.c_str(), &glyphs));
199  EXPECT_EQ(glyphs.size(), 2);
200  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8"));
201  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
203  true, str.c_str(), &glyphs));
204  EXPECT_EQ(glyphs.size(), 3);
205  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
206 }
207 
208 TEST(ValidateIndicTest, SinhalaSpecials) {
209  // Sinhala has some exceptions from the usual rules.
210  std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
211  std::vector<std::string> glyphs;
212  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
214  true, str.c_str(), &glyphs));
215  EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);
216  EXPECT_EQ(glyphs[0], std::string("\u0dc0"));
217  EXPECT_EQ(glyphs[1], std::string("\u0d9c"));
218  EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb"));
219  EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d"));
220  EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d"));
221  str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
222  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
224  true, str.c_str(), &glyphs));
225  EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);
226  EXPECT_EQ(glyphs[0], std::string("\u0dc3"));
227  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
228  EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d"));
229  EXPECT_EQ(glyphs[3], std::string("\u0dcf"));
230 }
231 
232 } // namespace
233 } // namespace tesseract
tesseract::OCRNorm::kNone
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::NormalizeUTF8String
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:163
tesseract::PrintStringVectorWithUnicodes
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
Definition: normstrngs_test.h:39
normstrngs_test.h
tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:188
include_gunit.h
tesseract::GraphemeNormMode::kGlyphSplit
tesseract::PrintString32WithUnicodes
std::string PrintString32WithUnicodes(const std::string &str)
Definition: normstrngs_test.h:34
tesseract
Definition: baseapi.h:65
tesseract::ExpectGraphemeModeResults
void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, const std::string &target_str)
Definition: normstrngs_test.h:48
normstrngs.h
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
tesseract::UnicodeNormMode::kNFC
tesseract::GraphemeNormMode::kCombined
tesseract::OCRNorm::kNormalize