tesseract  5.0.0-alpha-619-ge9db
validate_grapheme.cpp
Go to the documentation of this file.
1 #include "validate_grapheme.h"
2 #include "tprintf.h"
3 #include "unicode/uchar.h" // From libicu
4 
5 namespace tesseract {
6 
8  const unsigned num_codes = codes_.size();
9  char32 prev_prev_ch = ' ';
10  char32 prev_ch = ' ';
12  int num_codes_in_grapheme = 0;
13  while (codes_used_ < num_codes) {
14  CharClass cc = codes_[codes_used_].first;
15  char32 ch = codes_[codes_used_].second;
16  const bool is_combiner =
18  // TODO: Make this code work well with RTL text.
19  // See https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20  #if 0
21  // Reject easily detected badly formed sequences.
22  if (prev_cc == CharClass::kWhitespace && is_combiner) {
23  if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24  return false;
25  }
26  #endif
27  if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
28  if (report_errors_)
29  tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30  return false;
31  }
32  if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
33  IsBadlyFormed(prev_ch, ch)) {
34  return false;
35  }
36  bool prev_is_fwd_combiner =
37  prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38  (prev_ch == kZeroWidthNonJoiner &&
39  (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40  if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
41  break;
43  ++num_codes_in_grapheme;
44  prev_prev_ch = prev_ch;
45  prev_ch = ch;
46  prev_cc = cc;
47  }
48  if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme);
49  return true;
50 }
51 
53  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
54  // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
55  // always combine with the previous character.
56  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama;
57  if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace;
58  // Workaround for Javanese Aksara's Taling, do not label it as a combiner
59  if (ch == 0xa9ba) return CharClass::kConsonant;
60  int char_type = u_charType(ch);
61  if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
62  char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
63  ch == kZeroWidthJoiner)
64  return CharClass::kCombiner;
65  return CharClass::kOther;
66 }
67 
68 // Helper returns true if the sequence prev_ch,ch is invalid.
69 bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
70  // Reject badly formed Indic vowels.
71  if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
72  if (report_errors_)
73  tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
74  return true;
75  }
76  if (IsBadlyFormedThai(prev_ch, ch)) {
77  if (report_errors_) tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
78  return true;
79  }
80  return false;
81 }
82 
83 // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
84 // Some vowels in Indic scripts may be analytically decomposed into atomic pairs
85 // of components that are themselves valid unicode symbols. (See Table 12-1 in
86 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
87 // for examples in Devanagari). The Unicode standard discourages specifying
88 // vowels this way, but they are sometimes encountered in text, probably because
89 // some editors still permit it. Renderers however dislike such pairs, and so
90 // this function may be used to detect their occurrence for removal.
91 // TODO(rays) This function only covers a subset of Indic languages and doesn't
92 // include all rules. Add rules as appropriate to support other languages or
93 // find a way to generalize these existing rules that makes use of the
94 // regularity of the mapping from ISCII to Unicode.
95 /* static */
96 bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
97  return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) ||
98  (prev_ch == 0x909 && ch == 0x941) ||
99  (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
100  (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
101  (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
102  // Illegal combinations of two dependent Devanagari vowels.
103  (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
104  // Dependent Devanagari vowels following a virama.
105  (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
106  // Bengali vowels (Table 9-5, pg 313)
107  (prev_ch == 0x985 && ch == 0x9BE) ||
108  // Telugu vowels (Table 9-19, pg 331)
109  (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
110  // Kannada vowels (Table 9-20, pg 332)
111  (prev_ch == 0xC92 && ch == 0xCCC));
112 }
113 
114 // Helper returns true if ch is a Thai consonant.
115 static bool IsThaiConsonant(char32 ch) { return 0xe01 <= ch && ch <= 0xe2e; }
116 
117 // Helper returns true is ch is a before-consonant vowel.
118 static bool IsThaiBeforeConsonantVowel(char32 ch) {
119  return 0xe40 <= ch && ch <= 0xe44;
120 }
121 
122 // Helper returns true if ch is a Thai tone mark.
123 static bool IsThaiToneMark(char32 ch) { return 0xe48 <= ch && ch <= 0xe4b; }
124 
125 // Helper returns true if ch is a Thai vowel that may be followed by a tone
126 // mark.
127 static bool IsThaiTonableVowel(char32 ch) {
128  return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
129 }
130 
131 // Helper returns true if the sequence prev_ch,ch is invalid Thai.
132 // These rules come from a native Thai speaker, and are not covered by the
133 // Thai section in the unicode book:
134 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
135 // Comments below added by Ray interpreting the code ranges.
136 /* static */
137 bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
138  // Tone marks must follow consonants or specific vowels.
139  if (IsThaiToneMark(ch) &&
140  !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
141  return true;
142  }
143  // Tonable vowels must follow consonants.
144  if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
145  return true;
146  }
147  // Thanthakhat must follow consonant or specific vowels.
148  if (ch == 0xe4c &&
149  !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
150  return true;
151  }
152  // Nikkhahit must follow a consonant ?or certain markers?.
153  // TODO(rays) confirm this, but there were so many in the ground truth of the
154  // validation set that it seems reasonable to assume it is valid.
155  if (ch == 0xe4d &&
156  !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
157  return true;
158  }
159  // The vowels e30, e32, e33 can be used more liberally.
160  if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
161  !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
162  !(prev_ch == 0xe32 && ch == 0xe30) &&
163  !(prev_ch == 0xe4d && ch == 0xe32)) {
164  return true;
165  }
166  // Some vowels come before consonants, and therefore cannot follow things
167  // that cannot end a syllable.
168  if (IsThaiBeforeConsonantVowel(ch) &&
169  (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 ||
170  prev_ch == 0xe37)) {
171  return true;
172  }
173  // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
174  if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
175  return true;
176  }
177  return false;
178 }
179 
180 } // namespace tesseract
tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:196
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
tesseract::Validator::CharClass::kCombiner
validate_grapheme.h
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::Validator::CharClass
CharClass
Definition: validator.h:126
tesseract::char32
signed int char32
Definition: unichar.h:53
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract
Definition: baseapi.h:65
tesseract::ValidateGrapheme::ConsumeGraphemeIfValid
bool ConsumeGraphemeIfValid() override
Definition: validate_grapheme.cpp:7
tesseract::Validator::CharClass::kVedicMark
tprintf.h
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:186
tesseract::ValidateGrapheme::UnicodeToCharClass
CharClass UnicodeToCharClass(char32 ch) const override
Definition: validate_grapheme.cpp:52
tesseract::Validator::CharClass::kConsonant
char32
signed int char32
Definition: pango_font_info.h:33
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Validator::CharClass::kOther
tesseract::Validator::CharClass::kWhitespace