tesseract  5.0.0-alpha-619-ge9db
validate_javanese.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: validate_javanese.cpp
3  * Description: Text validator for Javanese Script - aksara jawa.
4  * Author: Shree Devi Kumar
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  **********************************************************************/
17 
18 #include "validate_javanese.h"
19 #include "errcode.h"
20 #include "tprintf.h"
21 
22 namespace tesseract {
23 
24 // Returns whether codes matches the pattern for a Javanese Grapheme.
25 // Taken from unicode standard:
26 // http://www.unicode.org/charts/PDF/UA980.pdf
27 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
28 // The Consonant class here includes independent vowels.
29 // The order of components in an orthographic syllable as expressed in BNF is:
30 // {C F} C {{R}Y} {V{A}} {Z}
31 // Translated to the codes used by the CharClass enum:
32 // [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
33 // Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
34 // Validation rules copied from validate_indic.cpp and modified for Javanese.
35 // Indic - for reference
36 // + vowel Grapheme: V[D](v)*
37 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
38 
// NOTE(review): the function signature and several `case` labels are absent
// from this listing (numbered lines 39, 41, 44, 46, 47 and 53 are skipped);
// the cross-reference index names this function ConsumeGraphemeIfValid().
// Dispatches on the character class of the next unconsumed code
// (codes_[codes_used_].first) and returns the result of the matching helper.
40  switch (codes_[codes_used_].first) {
// A grapheme consumed here must pass both the consonant head and tail helpers.
42  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
43  case CharClass::kVowel:
45  return ConsumeVowelIfValid();
48  // Apart from within an aksara, joiners are silently dropped.
49  if (report_errors_)
50  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
// The joiner is skipped without being copied to the output.
51  ++codes_used_;
52  return true;
// The return value of UseMultiCode() is deliberately ignored here: this
// branch reports success either way.
54  UseMultiCode(1);
55  return true;
56  default:
// Any class not handled above cannot start a grapheme: report and fail.
57  if (report_errors_) {
58  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
59  codes_[codes_used_].first, codes_[codes_used_].second);
60  }
61  return false;
62  }
63 }
64 
65 // Helper consumes/copies a virama and any associated post-virama joiners.
66 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
67 // no joiner at all) must be followed by a consonant.
68 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
69 // consonant, space, or character from a different script. We clean up the
70 // representation to make it consistent by adding a ZWNJ if missing from a
71 // non-linking virama. Returns false with an invalid sequence.
// Parameters:
//   joiner     - joiner code seen before the virama, as an IndicPair. A first
//                member of CharClass::kOther selects the "no pre-virama
//                joiner" path; ConsumeConsonantTailIfValid() always passes
//                IndicPair(CharClass::kOther, 0).
//   post_matra - passed as true by ConsumeConsonantTailIfValid() and false by
//                ConsumeConsonantHeadIfValid(); when true, a ZWJ following the
//                virama is rejected.
// NOTE(review): numbered lines 75, 89, 96, 109, 112, 115, 117 and 130 are
// absent from this listing, so some statements and condition tails below are
// incomplete as shown.
72 bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
73  const unsigned num_codes = codes_.size();
74  if (joiner.first == CharClass::kOther) {
// NOTE(review): numbered line 75 is missing here.
76  if (codes_used_ < num_codes &&
77  codes_[codes_used_].second == kZeroWidthJoiner) {
78  // Post-matra viramas must be explicit, so no joiners allowed here.
79  if (post_matra) {
80  if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
81  return false;
82  }
// NOTE(review): codes_used_ is declared unsigned, so `codes_used_ - 2` wraps
// around if fewer than two codes have been consumed at this point - confirm
// that callers guarantee codes_used_ >= 2 before this test.
83  if (codes_used_ + 1 < num_codes &&
84  codes_[codes_used_ - 2].second != kCakra &&
85  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
86  codes_[codes_used_ + 1].second == kPengkal ||
87  codes_[codes_used_ + 1].second == kCakra)) {
88  // This combination will be picked up later.
// NOTE(review): numbered line 89 (the body of this branch) is missing here.
90  } else {
91  // Half-form with optional Nukta.
// len spans everything in output_ not yet emitted, plus the code that
// UseMultiCode() is about to handle.
92  unsigned len = output_.size() + 1 - output_used_;
93  if (UseMultiCode(len)) return true;
94  }
// NOTE(review): the rest of this condition (numbered line 96) is missing.
95  if (codes_used_ < num_codes &&
// Only accepted when the first not-yet-emitted output code is kCakra.
97  if (output_used_ == output_.size() ||
98  output_[output_used_] != kCakra) {
99  if (report_errors_) {
100  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
101  static_cast<int>(script_));
102  }
103  return false;
104  }
105  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
// NOTE(review): the Sinhala wording presumably comes from validate_indic.cpp,
// which the file header says these rules were copied from.
106  if (UseMultiCode(4)) return true;
107  }
// NOTE(review): part of this condition (numbered line 109) is missing.
108  } else if (codes_used_ == num_codes ||
110  post_matra) {
// NOTE(review): numbered lines 112, 115 and 117 (the condition tail and both
// branch bodies) are missing from the if/else below.
111  if (codes_used_ == num_codes ||
113  // It is valid to have an unterminated virama at the end of a word, but
114  // for consistency, we will always add ZWNJ if not present.
116  } else {
118  }
119  // Explicit virama [H z]
120  MultiCodePart(2);
121  }
122  } else {
123  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true result from UseMultiCode() is treated as an error on this path: the
// message below reports that no second consonant follows.
124  if (UseMultiCode(2)) {
125  if (report_errors_)
126  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
127  return false;
128  }
// NOTE(review): the second operand of this condition (numbered line 130) is
// missing.
129  if (codes_[codes_used_].second == kZeroWidthJoiner ||
131  if (report_errors_) {
132  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
133  codes_[codes_used_].second);
134  }
135  return false;
136  }
137  }
138  // It is good so far as it goes.
139  return true;
140 }
141 
142 // Helper consumes/copies a series of consonants separated by viramas while
143 // valid, but not any vowel or other modifiers.
// In the lines visible here, the only failing exit is a false result from
// ConsumeViramaIfValid(); every other exit returns true.
// NOTE(review): numbered lines 148, 158, 160, 173, 193 and 199 are absent
// from this listing, so some statements and condition tails below are
// incomplete as shown.
144 bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
145  const unsigned num_codes = codes_.size();
146  // Consonant aksara
147  do {
// NOTE(review): numbered line 148 is missing here.
149  // Special Sinhala case of [H Z Yayana/Rayana].
// index may be negative when output_ holds fewer than 3 codes; the size test
// that comes first in the condition short-circuits before output_[index] is
// read in that case.
150  int index = output_.size() - 3;
151  if (output_used_ + 3 <= output_.size() &&
152  (output_.back() == kPengkal || output_.back() == kCakra) &&
153  IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
154  MultiCodePart(3);
155  }
156  bool have_nukta = false;
// NOTE(review): the condition tail (line 158) and one body statement
// (line 160) are missing from this if.
157  if (codes_used_ < num_codes &&
159  have_nukta = true;
161  }
162  // Test for subscript conjunct.
// have_nukta is used as 0/1 in the arithmetic below to widen the window by
// one code when set.
163  index = output_.size() - 2 - have_nukta;
164  if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
165  IsVirama(output_[index])) {
166  // Output previous virama, consonant + optional nukta.
167  MultiCodePart(2 + have_nukta);
168  }
// CharClass::kOther marks "no joiner"; ConsumeViramaIfValid() tests for it.
169  IndicPair joiner(CharClass::kOther, 0);
// NOTE(review): the tail of this condition (numbered line 173) is missing.
170  if (codes_used_ < num_codes &&
171  (codes_[codes_used_].second == kZeroWidthJoiner ||
172  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
174  joiner = codes_[codes_used_];
// A joiner that is the last code is dropped (not copied to output_) and the
// head is still reported as valid.
175  if (++codes_used_ == num_codes) {
176  if (report_errors_) {
177  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
178  joiner.second);
179  }
180  return true;
181  }
// The joiner is kept only when a virama follows it; otherwise it is
// discarded and the joiner state is reset to "none".
182  if (codes_[codes_used_].first == CharClass::kVirama) {
183  output_.push_back(joiner.second);
184  } else {
185  if (report_errors_) {
186  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
187  output_.back(), joiner.second, codes_[codes_used_].second);
188  }
189  joiner = std::make_pair(CharClass::kOther, 0);
190  }
191  }
// NOTE(review): the tail of this condition (numbered line 193) is missing.
192  if (codes_used_ < num_codes &&
194  if (!ConsumeViramaIfValid(joiner, false)) return false;
195  } else {
196  break; // No virama, so the run of consonants is over.
197  }
// NOTE(review): the tail of the loop condition (numbered line 199) is missing.
198  } while (codes_used_ < num_codes &&
// Emit anything still pending in output_.
200  if (output_used_ < output_.size()) MultiCodePart(1);
201  return true;
202 }
203 
204 // Helper consumes/copies a tail part of a consonant, comprising optional
205 // matra/piece, vowel modifier, vedic mark, terminating virama.
206 bool ValidateJavanese::ConsumeConsonantTailIfValid() {
207  if (codes_used_ == codes_.size()) return true;
208  // No virama: Finish the grapheme.
209  // Are multiple matras allowed?
210  if (codes_[codes_used_].first == CharClass::kMatra) {
211  if (UseMultiCode(1)) return true;
212  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
213  if (UseMultiCode(1)) return true;
214  }
215  }
216  // Tarung also used for long versions of u and o vowels and vocalic r
217  // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
218  while (codes_[codes_used_].first == CharClass::kMatraPiece) {
219  if (UseMultiCode(1)) return true;
220  }
221  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
222  if (UseMultiCode(1)) return true;
223  // Only Malayalam allows only repeated 0xd02.
224  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
225  }
226  while (codes_[codes_used_].first == CharClass::kVedicMark) {
227  if (UseMultiCode(1)) return true;
228  }
229  if (codes_[codes_used_].first == CharClass::kVirama) {
230  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
231  return false;
232  }
233  }
234  // What we have consumed so far is a valid consonant cluster.
235  if (output_used_ < output_.size()) MultiCodePart(1);
236 
237  return true;
238 }
239 
240 // Helper consumes/copies a vowel and optional modifiers.
241 bool ValidateJavanese::ConsumeVowelIfValid() {
242  if (UseMultiCode(1)) return true;
243  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
244  if (UseMultiCode(1)) return true;
245  // Only Malayalam allows repeated modifiers?
246  if (script_ != ViramaScript::kMalayalam) break;
247  }
248  while (codes_[codes_used_].first == CharClass::kVedicMark) {
249  if (UseMultiCode(1)) return true;
250  }
251  // What we have consumed so far is a valid vowel cluster.
252  return true;
253 }
254 
255 
// NOTE(review): the function signature and numbered lines 256-258 are absent
// from this listing; the cross-reference index names this function
// UnicodeToCharClass(char32 ch).
// Classifies ch by its offset from script_ (cast to char32). The tests below
// are ordered, so each `<=` test only sees offsets that the tests above it
// did not already claim.
259  // Offset from the start of the relevant unicode code block aka code page.
260  int off = ch - static_cast<char32>(script_);
261  // Anything in another code block is other.
262  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
// Offsets 0x00-0x03.
263  if (off < 0x4) return CharClass::kVowelModifier;
// Offsets 0x04-0x32.
264  if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
265  if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
266  if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
// Offsets 0x35-0x39.
267  if (off <= 0x39) return CharClass::kMatra;
// Only offset 0x3a reaches this test; it is classed as a consonant.
268  if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel
// Offsets 0x3b-0x3d.
269  if (off <= 0x3d) return CharClass::kMatra;
// Offsets 0x3e-0x3f are given the nukta class.
270  if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
271  if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
// Remaining offsets (0x41 up to kIndicCodePageSize - 1) have no special class.
272  return CharClass::kOther;
273 }
274 
275 } // namespace tesseract
tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:196
tesseract::Validator::CharClass::kZeroWidthJoiner
tesseract::Validator::CharClass::kVowelModifier
tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition: validator.h:210
tesseract::Validator::IsSubscriptScript
bool IsSubscriptScript() const
Definition: validator.cpp:198
tesseract::ValidateJavanese::UnicodeToCharClass
Validator::CharClass UnicodeToCharClass(char32 ch) const override
Definition: validate_javanese.cpp:270
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::ValidateJavanese::ConsumeGraphemeIfValid
bool ConsumeGraphemeIfValid() override
Definition: validate_javanese.cpp:53
tesseract::Validator::IndicPair
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:148
validate_javanese.h
tesseract::Validator::CharClass::kMatra
tesseract::Validator::IsVirama
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::Validator::kIndicCodePageSize
static const int kIndicCodePageSize
Definition: validator.h:228
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::Validator::CharClass
CharClass
Definition: validator.h:126
tesseract::ViramaScript::kMalayalam
tesseract::Validator::output_used_
unsigned output_used_
Definition: validator.h:254
tesseract::Validator::output_
std::vector< char32 > output_
Definition: validator.h:250
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract
Definition: baseapi.h:65
tesseract::Validator::CharClass::kVedicMark
tprintf.h
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:186
tesseract::Validator::CharClass::kConsonant
tesseract::Validator::CharClass::kNukta
char32
signed int char32
Definition: pango_font_info.h:33
tesseract::Validator::CharClass::kVowel
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h
tesseract::Validator::CharClass::kOther
tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:244
tesseract::Validator::CharClass::kZeroWidthNonJoiner
tesseract::Validator::CharClass::kMatraPiece