tesseract  5.0.0-alpha-619-ge9db
validate_indic.cpp
Go to the documentation of this file.
1 #include "validate_indic.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4 
5 namespace tesseract {
6 
7 // Returns whether codes matches the pattern for an Indic Grapheme.
8 // The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
9 // has a BNF for valid syllables (Graphemes) which is modified slightly
10 // for Unicode. Notably U+200C and U+200D are used before/after the
11 // virama/virama to express explicit or soft viramas.
12 // Also the unicode v.9 Malayalam entry states that CZHC can be used in several
13 // Indic languages to request traditional ligatures, and CzHC is Malayalam-
14 // specific for requesting open conjuncts.
15 //
16 // + vowel Grapheme: V[D](v)*
17 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
// NOTE(review): the function header (embedded listing line 18) is not present
// in this extract; the symbol index at the end of the page lists it as
// "bool ConsumeGraphemeIfValid() override".
// Dispatches on the class of the next unconsumed code, codes_[codes_used_].
19  switch (codes_[codes_used_].first) {
// NOTE(review): embedded line 20 is missing here; presumably the case label
// for consonants -- confirm against the original file.
21  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
22  case CharClass::kVowel:
// NOTE(review): embedded line 23 is missing here.
24  return ConsumeVowelIfValid();
// NOTE(review): embedded lines 25-26 are missing here; presumably the case
// labels for the joiner classes -- confirm against the original file.
27  // Apart from within an aksara, joiners are silently dropped.
28  if (report_errors_)
29  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
// The joiner is stepped over without anything being appended to the output.
30  ++codes_used_;
31  return true;
32  case CharClass::kOther:
// A single code is handed to UseMultiCode(1); its return value is ignored and
// the grapheme is reported as valid.
33  UseMultiCode(1);
34  return true;
35  default:
// Any remaining class is not accepted as the start of a grapheme: optionally
// log the offending class/code, then reject.
36  if (report_errors_) {
37  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
38  codes_[codes_used_].first, codes_[codes_used_].second);
39  }
40  return false;
41  }
42 }
43 
// NOTE(review): the function header (embedded listing line 44) is not present
// in this extract; the symbol index at the end of the page lists it as
// "Validator::CharClass UnicodeToCharClass(char32 ch) const override".
// Embedded lines 46, 47 and 56 are also absent, so some early checks and the
// opening of the if/else below are not visible.
// Maps a single code point to a CharClass, mostly by its offset from the
// numeric value of script_.
45  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
48  // Offset from the start of the relevant unicode code block aka code page.
49  int base = static_cast<char32>(script_);
50  int off = ch - base;
51  // Anything in another code block is other.
52  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
53  // Exception for Tamil. The aytham character is considered a letter.
54  if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
55  if (off < 0x4) return CharClass::kVowelModifier;
// NOTE(review): embedded line 56 is missing here; presumably the Sinhala test
// that opens the branch closed by "} else {" below -- confirm.
57  // Sinhala is an exception.
58  if (off <= 0x19) return CharClass::kVowel;
59  if (off <= 0x49) return CharClass::kConsonant;
60  if (off == 0x4a) return CharClass::kVirama;
61  if (off <= 0x5f) return CharClass::kMatra;
62  } else {
// Layout used by the other branch. The tests are order-dependent: each range
// check relies on smaller offsets having already returned above it.
63  if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
64  if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
65  return CharClass::kConsonant;
66  // Sinhala doesn't have Nukta or Avagraha.
67  if (off == 0x3c) return CharClass::kNukta;
68  if (off == 0x3d) return CharClass::kVowel; // avagraha
69  if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
70  if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
71  if (off == 0x4d) return CharClass::kVirama;
72  }
// Offsets 0x60-0x63 are classified the same way in both branches.
73  if (off == 0x60 || off == 0x61) return CharClass::kVowel;
74  if (off == 0x62 || off == 0x63) return CharClass::kMatra;
75  // Danda and digits up to 6f are OK as other.
76  // 70-7f are script-specific.
77  // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
78  if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72))
79  return CharClass::kOther;
80  // 0BF3-0BFA are other Tamil symbols.
81  if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A))
82  return CharClass::kOther;
83  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
84  return CharClass::kConsonant;
85  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
86  return CharClass::kConsonant;
87  if (script_ == ViramaScript::kSinhala && off == 0x70)
88  return CharClass::kConsonant;
89  if (script_ == ViramaScript::kDevanagari && off == 0x70)
90  return CharClass::kOther;
// Fallback for 0x70-0x73 in scripts not handled by the special cases above.
91  if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
92  // Non Indic, Digits, Measures, danda, etc.
93  return CharClass::kOther;
94 }
95 
96 // Helper consumes/copies a virama and any associated post-virama joiners.
97 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
98 // no joiner at all) must be followed by a consonant.
99 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
100 // consonant, space, or character from a different script. We clean up the
101 // representation to make it consistent by adding a ZWNJ if missing from a
102 // non-linking virama. Returns false with an invalid sequence.
// joiner: the pre-virama joiner, or a pair whose class is CharClass::kOther
//   when the caller has none to pass.
// post_matra: true when called for a virama in the consonant tail; in that
//   case a following ZWJ is rejected below.
// NOTE(review): this listing is an extract with embedded line numbers;
// embedded lines 106, 120, 127, 140, 143, 148 and 161 are absent, so several
// conditions and statements below are incomplete as shown.
103 bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
104  const unsigned num_codes = codes_.size();
// No pre-virama joiner was supplied.
105  if (joiner.first == CharClass::kOther) {
107  if (codes_used_ < num_codes &&
108  codes_[codes_used_].second == kZeroWidthJoiner) {
109  // Post-matra viramas must be explicit, so no joiners allowed here.
110  if (post_matra) {
111  if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
112  return false;
113  }
// NOTE(review): the next condition reads codes_[codes_used_ - 2] with an
// unsigned index; it relies on at least two codes having been consumed
// before this point -- confirm.
114  if (codes_used_ + 1 < num_codes &&
115  codes_[codes_used_ - 2].second != kRayana &&
116  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
117  codes_[codes_used_ + 1].second == kYayana ||
118  codes_[codes_used_ + 1].second == kRayana)) {
119  // This combination will be picked up later.
121  } else {
122  // Half-form with optional Nukta.
123  unsigned len = output_.size() + 1 - output_used_;
124  if (UseMultiCode(len)) return true;
125  }
126  if (codes_used_ < num_codes &&
128  if (output_used_ == output_.size() ||
129  output_[output_used_] != kRayana) {
130  if (report_errors_) {
131  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
132  static_cast<int>(script_));
133  }
134  return false;
135  }
136  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
137  if (UseMultiCode(4)) return true;
138  }
139  } else if (codes_used_ == num_codes ||
141  post_matra) {
142  if (codes_used_ == num_codes ||
144  // It is valid to have an unterminated virama at the end of a word, but
145  // for consistency, we will always add ZWNJ if not present.
146  output_.push_back(kZeroWidthNonJoiner);
147  } else {
149  }
150  // Explicit virama [H z]
151  MultiCodePart(2);
152  }
153  } else {
154  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true return from UseMultiCode(2) is treated as an error here, since the
// joiner+virama pair then has no second consonant after it.
155  if (UseMultiCode(2)) {
156  if (report_errors_)
157  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
158  return false;
159  }
// A further joiner after a joiner+virama pair is rejected.
160  if (codes_[codes_used_].second == kZeroWidthJoiner ||
162  if (report_errors_) {
163  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
164  codes_[codes_used_].second);
165  }
166  return false;
167  }
168  }
169  // It is good so far as it goes.
170  return true;
171 }
172 
173 // Helper consumes/copies a series of consonants separated by viramas while
174 // valid, but not any vowel or other modifiers.
// Returns false only when ConsumeViramaIfValid rejects the sequence; every
// other exit visible here returns true.
// NOTE(review): this listing is an extract with embedded line numbers;
// embedded lines 179, 189, 191, 204, 224 and 230 are absent, so the consonant
// copy, the nukta handling, parts of two conditions and the loop condition
// are incomplete as shown.
175 bool ValidateIndic::ConsumeConsonantHeadIfValid() {
176  const unsigned num_codes = codes_.size();
177  // Consonant aksara
178  do {
180  // Special Sinhala case of [H Z Yayana/Rayana].
// index is computed before the size guard; output_[index] is only evaluated
// after the "output_used_ + 3 <= output_.size()" test has passed.
181  int index = output_.size() - 3;
182  if (output_used_ + 3 <= output_.size() &&
183  (output_.back() == kYayana || output_.back() == kRayana) &&
184  IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
185  MultiCodePart(3);
186  }
187  bool have_nukta = false;
188  if (codes_used_ < num_codes &&
190  have_nukta = true;
192  }
193  // Test for subscript conjunct.
// have_nukta (0 or 1) widens the look-back by one position when set.
194  index = output_.size() - 2 - have_nukta;
195  if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
196  IsVirama(output_[index])) {
197  // Output previous virama, consonant + optional nukta.
198  MultiCodePart(2 + have_nukta);
199  }
// joiner starts out as "none" (class kOther) and is only replaced when a
// joiner code immediately precedes a virama.
200  IndicPair joiner(CharClass::kOther, 0);
201  if (codes_used_ < num_codes &&
202  (codes_[codes_used_].second == kZeroWidthJoiner ||
203  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
205  joiner = codes_[codes_used_];
// A joiner that is the last code of the input is dropped, not copied.
206  if (++codes_used_ == num_codes) {
207  if (report_errors_) {
208  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
209  joiner.second);
210  }
211  return true;
212  }
213  if (codes_[codes_used_].first == CharClass::kVirama) {
214  output_.push_back(joiner.second);
215  } else {
216  if (report_errors_) {
217  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
218  output_.back(), joiner.second, codes_[codes_used_].second);
219  }
// Reset to "none" so the dropped joiner is not passed on to the virama helper.
220  joiner = std::make_pair(CharClass::kOther, 0);
221  }
222  }
223  if (codes_used_ < num_codes &&
225  if (!ConsumeViramaIfValid(joiner, false)) return false;
226  } else {
227  break; // No virama, so the run of consonants is over.
228  }
229  } while (codes_used_ < num_codes &&
// Anything appended to output_ beyond output_used_ is passed to
// MultiCodePart(1) before returning.
231  if (output_used_ < output_.size()) MultiCodePart(1);
232  return true;
233 }
234 
235 // Helper consumes/copies a tail part of a consonant, comprising optional
236 // matra/piece, vowel modifier, vedic mark, terminating virama.
237 bool ValidateIndic::ConsumeConsonantTailIfValid() {
238  if (codes_used_ == codes_.size()) return true;
239  // No virama: Finish the grapheme.
240  // Are multiple matras allowed?
241  if (codes_[codes_used_].first == CharClass::kMatra) {
242  if (UseMultiCode(1)) return true;
243  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
244  if (UseMultiCode(1)) return true;
245  }
246  }
247  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
248  if (UseMultiCode(1)) return true;
249  // Only Malayalam allows only repeated 0xd02.
250  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
251  }
252  while (codes_[codes_used_].first == CharClass::kVedicMark) {
253  if (UseMultiCode(1)) return true;
254  }
255  if (codes_[codes_used_].first == CharClass::kVirama) {
256  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
257  return false;
258  }
259  }
260  // What we have consumed so far is a valid consonant cluster.
261  if (output_used_ < output_.size()) MultiCodePart(1);
262 
263  return true;
264 }
265 
266 // Helper consumes/copies a vowel and optional modifiers.
267 bool ValidateIndic::ConsumeVowelIfValid() {
268  if (UseMultiCode(1)) return true;
269  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
270  if (UseMultiCode(1)) return true;
271  // Only Malayalam allows repeated modifiers?
272  if (script_ != ViramaScript::kMalayalam) break;
273  }
274  while (codes_[codes_used_].first == CharClass::kVedicMark) {
275  if (UseMultiCode(1)) return true;
276  }
277  // What we have consumed so far is a valid vowel cluster.
278  return true;
279 }
280 
281 } // namespace tesseract
tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:196
tesseract::Validator::CharClass::kZeroWidthJoiner
tesseract::Validator::CharClass::kVowelModifier
validate_indic.h
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition: validator.h:210
tesseract::Validator::IsSubscriptScript
bool IsSubscriptScript() const
Definition: validator.cpp:198
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Validator::IndicPair
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:148
tesseract::ValidateIndic::ConsumeGraphemeIfValid
bool ConsumeGraphemeIfValid() override
Definition: validate_indic.cpp:18
tesseract::Validator::CharClass::kMatra
tesseract::Validator::IsVirama
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::Validator::kIndicCodePageSize
static const int kIndicCodePageSize
Definition: validator.h:228
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::ViramaScript::kTamil
tesseract::Validator::CharClass
CharClass
Definition: validator.h:126
tesseract::ViramaScript::kMalayalam
tesseract::Validator::output_used_
unsigned output_used_
Definition: validator.h:254
tesseract::ViramaScript::kBengali
tesseract::Validator::output_
std::vector< char32 > output_
Definition: validator.h:250
tesseract::char32
signed int char32
Definition: unichar.h:53
tesseract::ViramaScript::kDevanagari
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract
Definition: baseapi.h:65
tesseract::Validator::CharClass::kVedicMark
tprintf.h
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:186
tesseract::Validator::CharClass::kConsonant
tesseract::Validator::CharClass::kNukta
tesseract::Validator::CharClass::kVowel
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h
tesseract::ViramaScript::kSinhala
tesseract::Validator::CharClass::kOther
tesseract::ValidateIndic::UnicodeToCharClass
Validator::CharClass UnicodeToCharClass(char32 ch) const override
Definition: validate_indic.cpp:44
tesseract::ViramaScript::kGurmukhi
tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:244
tesseract::Validator::CharClass::kZeroWidthNonJoiner
tesseract::Validator::CharClass::kMatraPiece