tesseract  4.0.0-1-g2a2b
validate_myanmar.cpp
Go to the documentation of this file.
1 #include "validate_myanmar.h"
2 #include "errcode.h"
3 #include "icuerrorcode.h"
4 #include "tprintf.h"
5 #include "unicode/uchar.h" // From libicu
6 #include "unicode/uscript.h" // From libicu
7 
8 namespace tesseract {
9 
10 // Returns whether codes matches the pattern for a Myanmar Grapheme.
11 // Taken directly from the unicode table 16-3.
12 // See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
14  int num_codes = codes_.size();
15  if (codes_used_ == num_codes) return true;
16  // Other.
17  if (IsMyanmarOther(codes_[codes_used_].second)) {
18  UseMultiCode(1);
19  return true;
20  }
21  // Kinzi.
22  if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
23  codes_[codes_used_ + 1].second == kMyanmarAsat &&
24  codes_[codes_used_ + 2].second == kMyanmarVirama) {
27  if (UseMultiCode(3)) return true;
28  }
29  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
30  // optional, except the base, this is the only place where invalid input can
31  // be detected and false returned.
32  if (IsMyanmarLetter(codes_[codes_used_].second)) {
33  if (UseMultiCode(1)) return true;
34  } else {
35  if (report_errors_) {
36  tprintf("Invalid start of Myanmar syllable:0x%x\n",
37  codes_[codes_used_].second);
38  }
39  return false; // One of these is required.
40  }
41  if (ConsumeSubscriptIfPresent()) return true;
42  ConsumeOptionalSignsIfPresent();
43  // What we have consumed so far is a valid syllable.
44  return true;
45 }
46 
47 // TODO(rays) Doesn't use intermediate coding like the other scripts, as there
48 // is little correspondence between the content of table 16-3 and the char
49 // classes of the Indic languages. (Experts may disagree and improve!)
50 // In unicode table 16-3 there is basically a long list of optional characters,
51 // which can be coded quite easily.
52 // Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
53 // The table also allows sequences that still result in dotted circles!!
54 // So with a lot of guesswork the rest have been added in a reasonable place.
56  if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
57  return CharClass::kOther;
58 }
59 
60 // Helper consumes/copies a virama and any subscript consonant.
61 // Returns true if the end of input is reached.
62 bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
63  // Subscript consonant. It appears there can be only one.
64  int num_codes = codes_.size();
65  if (codes_used_ + 1 < num_codes &&
66  codes_[codes_used_].second == kMyanmarVirama) {
67  if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
69  if (UseMultiCode(2)) return true;
70  }
71  }
72  return false;
73 }
74 
75 // Helper consumes/copies a series of optional signs.
76 // Returns true if the end of input is reached.
77 bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
78  // The following characters are allowed, all optional, and in sequence.
79  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
80  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
81  0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
82  0x1081, 0x1031});
83  for (char32 ch : kMedials) {
84  if (codes_[codes_used_].second == ch) {
85  if (UseMultiCode(1)) return true;
86  if (ch == kMyanmarMedialYa &&
87  codes_[codes_used_].second == kMyanmarAsat) {
88  if (UseMultiCode(1)) return true;
89  }
90  }
91  }
92  // Vowel sign i, ii, ai.
93  char32 ch = codes_[codes_used_].second;
94  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
95  if (UseMultiCode(1)) return true;
96  }
97  // Vowel sign u, uu, and extensions.
98  ch = codes_[codes_used_].second;
99  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
100  ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
101  (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
102  ch == 0x109c || ch == 0x109d) {
103  if (UseMultiCode(1)) return true;
104  }
105  // Tall aa, aa with optional asat.
106  if (codes_[codes_used_].second == 0x102b ||
107  codes_[codes_used_].second == 0x102c) {
108  if (UseMultiCode(1)) return true;
109  if (codes_[codes_used_].second == kMyanmarAsat) {
110  if (UseMultiCode(1)) return true;
111  }
112  }
113  // The following characters are allowed, all optional, and in sequence.
114  const std::vector<char32> kSigns({0x1036, 0x1037});
115  for (char32 ch : kSigns) {
116  if (codes_[codes_used_].second == ch) {
117  if (UseMultiCode(1)) return true;
118  }
119  }
120  // Tone mark extensions.
121  ch = codes_[codes_used_].second;
122  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
123  (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
124  ch == 0x108f || ch == 0x109a || ch == 0x109b ||
125  (0xaa7b <= ch && ch <= 0xaa7d)) {
126  if (UseMultiCode(1)) return true;
127  }
128  return false;
129 }
130 
131 // Returns true if the unicode is a Myanmar "letter" including consonants
132 // and independent vowels. Although table 16-3 distinguishes between some
133 // base consonants and vowels, the extensions make no such distinction, so we
134 // put them all into a single bucket.
135 /* static */
136 bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
137  return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
138  (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
139  ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
140  (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
141  ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
142  (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
143  ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
144 }
145 
146 // Returns true if ch is a Myanmar digit or other symbol that does not take
147 // part in being a syllable.
148 /* static */
149 bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
150  IcuErrorCode err;
151  UScriptCode script_code = uscript_getScript(ch, err);
152  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
154  return true;
155  return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
156  (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
157  (0xaa74 <= ch && ch <= 0xaa79);
158 }
159 
160 } // namespace tesseract
std::vector< IndicPair > codes_
Definition: validator.h:232
signed int char32
bool ConsumeGraphemeIfValid() override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
signed int char32
Definition: unichar.h:52
static const char32 kMyanmarVirama
Definition: validator.h:223
Validator::CharClass UnicodeToCharClass(char32 ch) const override
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool UseMultiCode(int length)
Definition: validator.h:196
static const char32 kZeroWidthJoiner
Definition: validator.h:97
#define ASSERT_HOST(x)
Definition: errcode.h:84