3 #include "unicode/uchar.h" 8 int num_codes =
codes_.size();
12 int num_codes_in_grapheme = 0;
16 const bool is_combiner =
25 tprintf(
"Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
29 IsBadlyFormed(prev_ch, ch)) {
32 bool prev_is_fwd_combiner =
36 if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
39 ++num_codes_in_grapheme;
40 prev_prev_ch = prev_ch;
44 if (num_codes_in_grapheme > 0)
MultiCodePart(num_codes_in_grapheme);
56 int char_type = u_charType(ch);
57 if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
65 bool ValidateGrapheme::IsBadlyFormed(
char32 prev_ch,
char32 ch) {
67 if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
69 tprintf(
"Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
72 if (IsBadlyFormedThai(prev_ch, ch)) {
92 bool ValidateGrapheme::IsBadlyFormedIndicVowel(
char32 prev_ch,
char32 ch) {
93 return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) ||
94 (prev_ch == 0x909 && ch == 0x941) ||
95 (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
96 (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
97 (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
99 (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
101 (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
103 (prev_ch == 0x985 && ch == 0x9BE) ||
105 (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
107 (prev_ch == 0xC92 && ch == 0xCCC));
111 static bool IsThaiConsonant(
char32 ch) {
return 0xe01 <= ch && ch <= 0xe2e; }
114 static bool IsThaiBeforeConsonantVowel(
char32 ch) {
115 return 0xe40 <= ch && ch <= 0xe44;
119 static bool IsThaiToneMark(
char32 ch) {
return 0xe48 <= ch && ch <= 0xe4b; }
123 static bool IsThaiTonableVowel(
char32 ch) {
124 return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
133 bool ValidateGrapheme::IsBadlyFormedThai(
char32 prev_ch,
char32 ch) {
135 if (IsThaiToneMark(ch) &&
136 !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
140 if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
145 !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
152 !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
156 if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
157 !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
158 !(prev_ch == 0xe32 && ch == 0xe30) &&
159 !(prev_ch == 0xe4d && ch == 0xe32)) {
164 if (IsThaiBeforeConsonantVowel(ch) &&
165 (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 ||
170 if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
std::vector< IndicPair > codes_
static bool IsVedicAccent(char32 unicode)
static const char32 kZeroWidthNonJoiner
CharClass UnicodeToCharClass(char32 ch) const override
void MultiCodePart(int length)
bool ConsumeGraphemeIfValid() override
DLLSYM void tprintf(const char *format,...)
static const char32 kZeroWidthJoiner