tesseract  4.0.0-1-g2a2b
validate_javanese.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: validate_javanese.cpp
3  * Description: Text validator for Javanese Script - aksara jawa.
4  * Author: Shree Devi Kumar
5  * Created: August 03, 2018
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  * http://www.apache.org/licenses/LICENSE-2.0
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "validate_javanese.h"
20 #include "errcode.h"
21 #include "tprintf.h"
22 
23 namespace tesseract {
24 
25 // Returns whether the codes match the pattern for a Javanese grapheme.
26 // Taken from unicode standard:
27 // http://www.unicode.org/charts/PDF/UA980.pdf
28 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
29 // The Consonant class here includes independent vowels.
30 // The order of components in an orthographic syllable as expressed in BNF is:
31 // {C F} C {{R}Y} {V{A}} {Z}
32 // Translated to the codes used by the CharClass enum:
33 // [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
34 // Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
35 // Validation rules copied from validate_indic.cpp and modified for Javanese.
36 // Indic - for reference
37 // + vowel Grapheme: V[D](v)*
38 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
39 
// NOTE(review): this listing skips source lines 40, 42, 45, 47 and 48, so the
// function signature and some case labels are not visible here; comments on
// the affected arms are hedged accordingly.
// Dispatch on the class of the next unconsumed code.
41  switch (codes_[codes_used_].first) {
// Presumably the consonant arm (its label is not shown): both the head and
// the tail must be accepted for the result to be true.
43  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
44  case CharClass::kVowel:
46  return ConsumeVowelIfValid();
// NOTE(review): the case label(s) guarding the next arm are missing from the
// listing; from the message below it handles joiners - confirm.
49  // Apart from within an aksara, joiners are silently dropped.
50  if (report_errors_)
51  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
// Step past the code without calling any output helper; still a success.
52  ++codes_used_;
53  return true;
54  case CharClass::kOther:
// One code is handed to UseMultiCode; its result is ignored and the arm
// always reports success.
55  UseMultiCode(1);
56  return true;
57  default:
// Any remaining class cannot start a grapheme: optionally report, then fail.
// NOTE(review): %c is fed the CharClass value - presumably the enum values
// are character codes; confirm against the enum definition.
58  if (report_errors_) {
59  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
60  codes_[codes_used_].first, codes_[codes_used_].second);
61  }
62  return false;
63  }
64 }
65 
66 // Helper consumes/copies a virama and any associated post-virama joiners.
67 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
68 // no joiner at all) must be followed by a consonant.
69 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
70 // consonant, space, or character from a different script. We clean up the
71 // representation to make it consistent by adding a ZWNJ if missing from a
72 // non-linking virama. Returns false with an invalid sequence.
//
// joiner: the pre-virama joiner the caller has already seen, or a pair whose
//   first member is CharClass::kOther when there was none.
// post_matra: true when called from ConsumeConsonantTailIfValid, false when
//   called from ConsumeConsonantHeadIfValid.
// Returns true when the sequence is acceptable so far, false after an invalid
// combination (a message is printed first when report_errors_ is set).
//
// NOTE(review): this listing skips source lines 76, 90, 97, 110, 113, 116,
// 118 and 131, so some conditions and statements below are incomplete as
// shown. Do not treat the visible text as the full logic.
73 bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
74  int num_codes = codes_.size();
// Branch 1: no pre-virama joiner was supplied.
75  if (joiner.first == CharClass::kOther) {
// NOTE(review): source line 76 is missing here; presumably it consumes the
// virama itself before the joiner checks - confirm.
77  if (codes_used_ < num_codes &&
78  codes_[codes_used_].second == kZeroWidthJoiner) {
79  // Post-matra viramas must be explicit, so no joiners allowed here.
80  if (post_matra) {
81  if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
82  return false;
83  }
// Looks one code ahead and two codes back.
// NOTE(review): codes_used_ - 2 assumes at least two codes precede this
// position - confirm that callers guarantee it.
84  if (codes_used_ + 1 < num_codes &&
85  codes_[codes_used_ - 2].second != kCakra &&
86  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
87  codes_[codes_used_ + 1].second == kPengkal ||
88  codes_[codes_used_ + 1].second == kCakra)) {
89  // This combination will be picked up later.
// NOTE(review): source line 90 (the body of this arm) is missing.
91  } else {
92  // Half-form with optional Nukta.
// len spans everything in output_ from output_used_ onward, plus one.
93  int len = output_.size() + 1 - output_used_;
94  if (UseMultiCode(len)) return true;
95  }
// NOTE(review): source line 97 (the rest of this condition) is missing.
96  if (codes_used_ < num_codes &&
98  if (output_used_ == output_.size() ||
99  output_[output_used_] != kCakra) {
// The message text still names Sinhala; the header comment of this file says
// these rules were copied from validate_indic.cpp.
100  if (report_errors_) {
101  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
102  static_cast<int>(script_));
103  }
104  return false;
105  }
106  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
107  if (UseMultiCode(4)) return true;
108  }
// NOTE(review): source line 110 (middle of this condition) is missing.
109  } else if (codes_used_ == num_codes ||
111  post_matra) {
// NOTE(review): source lines 113, 116 and 118 are missing, so neither the
// rest of this condition nor the bodies of its two arms are visible.
112  if (codes_used_ == num_codes ||
114  // It is valid to have an unterminated virama at the end of a word, but
115  // for consistency, we will always add ZWNJ if not present.
117  } else {
119  }
120  // Explicit virama [H z]
121  MultiCodePart(2);
122  }
// Branch 2: a pre-virama joiner was supplied by the caller.
123  } else {
124  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// Unlike the other UseMultiCode call sites in this function, a true result
// here is treated as a failure.
125  if (UseMultiCode(2)) {
126  if (report_errors_)
127  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
128  return false;
129  }
// NOTE(review): source line 131 (the rest of this condition) is missing.
130  if (codes_[codes_used_].second == kZeroWidthJoiner ||
132  if (report_errors_) {
133  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
134  codes_[codes_used_].second);
135  }
136  return false;
137  }
138  }
139  // It is good so far as it goes.
140  return true;
141 }
142 
143 // Helper consumes/copies a series of consonants separated by viramas while
144 // valid, but not any vowel or other modifiers.
//
// Returns false only when ConsumeViramaIfValid rejects a virama; returns true
// when the run ends normally or when the input ends on a joiner.
//
// NOTE(review): this listing skips source lines 149, 159, 161, 174, 194 and
// 200, so some conditions and statements below are incomplete as shown.
145 bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
146  const int num_codes = codes_.size();
147  // Consonant aksara
148  do {
// NOTE(review): source line 149 is missing; presumably it moves the current
// consonant to output_ before the checks below - confirm.
150  // Special Sinhala case of [H Z Yayana/Rayana].
// Inspects the last three entries of output_.
// NOTE(review): index is negative when output_ holds fewer than three codes;
// the comparison with output_used_ then depends on that member's signedness -
// confirm.
151  int index = output_.size() - 3;
152  if (output_used_ <= index &&
153  (output_.back() == kPengkal || output_.back() == kCakra) &&
154  IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
155  MultiCodePart(3);
156  }
157  bool have_nukta = false;
// NOTE(review): source lines 159 and 161 are missing (the rest of the
// condition and, presumably, the statement that consumes the nukta).
158  if (codes_used_ < num_codes &&
160  have_nukta = true;
162  }
163  // Test for subscript conjunct.
// have_nukta converts to 0 or 1, shifting the lookup back by one code when a
// nukta was seen.
164  index = output_.size() - 2 - have_nukta;
165  if (output_used_ <= index && IsSubscriptScript() &&
166  IsVirama(output_[index])) {
167  // Output previous virama, consonant + optional nukta.
168  MultiCodePart(2 + have_nukta);
169  }
// Start with "no joiner"; replaced below if a joiner precedes a virama.
170  IndicPair joiner(CharClass::kOther, 0);
// NOTE(review): source line 174 (the end of this condition) is missing.
171  if (codes_used_ < num_codes &&
172  (codes_[codes_used_].second == kZeroWidthJoiner ||
173  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
175  joiner = codes_[codes_used_];
// A joiner that is the very last code is skipped, not copied to output_.
176  if (++codes_used_ == num_codes) {
177  if (report_errors_) {
178  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
179  joiner.second);
180  }
181  return true;
182  }
// The joiner is kept only when a virama follows it; otherwise it is skipped
// and joiner is reset to the "no joiner" value.
183  if (codes_[codes_used_].first == CharClass::kVirama) {
184  output_.push_back(joiner.second);
185  } else {
186  if (report_errors_) {
187  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
188  output_.back(), joiner.second, codes_[codes_used_].second);
189  }
190  joiner = std::make_pair(CharClass::kOther, 0);
191  }
192  }
// NOTE(review): source line 194 (the rest of this condition) is missing;
// from the else-comment below it tests for a virama - confirm.
193  if (codes_used_ < num_codes &&
195  if (!ConsumeViramaIfValid(joiner, false)) return false;
196  } else {
197  break; // No virama, so the run of consonants is over.
198  }
// NOTE(review): source line 200 (the end of the loop condition) is missing.
199  } while (codes_used_ < num_codes &&
// Hand over whatever remains in output_ beyond output_used_.
201  if (output_used_ < output_.size()) MultiCodePart(1);
202  return true;
203 }
204 
205 // Helper consumes/copies a tail part of a consonant, comprising optional
206 // matra/piece, vowel modifier, vedic mark, terminating virama.
207 bool ValidateJavanese::ConsumeConsonantTailIfValid() {
208  if (codes_used_ == codes_.size()) return true;
209  // No virama: Finish the grapheme.
210  // Are multiple matras allowed?
211  if (codes_[codes_used_].first == CharClass::kMatra) {
212  if (UseMultiCode(1)) return true;
213  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
214  if (UseMultiCode(1)) return true;
215  }
216  }
217  // Tarung also used for long versions of u and o vowels and vocalic r
218  // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
219  while (codes_[codes_used_].first == CharClass::kMatraPiece) {
220  if (UseMultiCode(1)) return true;
221  }
222  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
223  if (UseMultiCode(1)) return true;
224  // Only Malayalam allows only repeated 0xd02.
225  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
226  }
227  while (codes_[codes_used_].first == CharClass::kVedicMark) {
228  if (UseMultiCode(1)) return true;
229  }
230  if (codes_[codes_used_].first == CharClass::kVirama) {
231  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
232  return false;
233  }
234  }
235  // What we have consumed so far is a valid consonant cluster.
236  if (output_used_ < output_.size()) MultiCodePart(1);
237 
238  return true;
239 }
240 
241 // Helper consumes/copies a vowel and optional modifiers.
242 bool ValidateJavanese::ConsumeVowelIfValid() {
243  if (UseMultiCode(1)) return true;
244  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
245  if (UseMultiCode(1)) return true;
246  // Only Malayalam allows repeated modifiers?
247  if (script_ != ViramaScript::kMalayalam) break;
248  }
249  while (codes_[codes_used_].first == CharClass::kVedicMark) {
250  if (UseMultiCode(1)) return true;
251  }
252  // What we have consumed so far is a valid vowel cluster.
253  return true;
254 }
255 
256 
// NOTE(review): source lines 257-259 (the function signature and anything
// else before this point) are missing from this listing.
// Classifies ch by its offset from script_. The tests below cascade: each
// `<=` only sees offsets that no earlier test has already returned for.
// The code points named in the trailing comments (A9B3 at offset 0x33, A9C0
// at offset 0x40) imply a block base of U+A980.
260  // Offset from the start of the relevant unicode code block aka code page.
261  int off = ch - static_cast<char32>(script_);
262  // Anything in another code block is other.
263  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
// Offsets 0x00-0x03.
264  if (off < 0x4) return CharClass::kVowelModifier;
// Offsets 0x04-0x32.
265  if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
266  if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
267  if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
// Offsets 0x35-0x39.
268  if (off <= 0x39) return CharClass::kMatra;
// Only offset 0x3a reaches this test; it is classed as a consonant rather
// than as a matra.
269  if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel
// Offsets 0x3b-0x3d.
270  if (off <= 0x3d) return CharClass::kMatra;
// Offsets 0x3e-0x3f.
271  if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
272  if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
// Everything from offset 0x41 up to kIndicCodePageSize.
273  return CharClass::kOther;
274 }
275 
276 } // namespace tesseract
std::vector< IndicPair > codes_
Definition: validator.h:232
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
bool IsSubscriptScript() const
Definition: validator.cpp:198
signed int char32
Definition: unichar.h:52
void MultiCodePart(int length)
Definition: validator.h:182
static const int kIndicCodePageSize
Definition: validator.h:214
ViramaScript script_
Definition: validator.h:230
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool UseMultiCode(int length)
Definition: validator.h:196
static const char32 kZeroWidthJoiner
Definition: validator.h:97
Validator::CharClass UnicodeToCharClass(char32 ch) const override
bool ConsumeGraphemeIfValid() override
std::vector< char32 > output_
Definition: validator.h:236
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:134
#define ASSERT_HOST(x)
Definition: errcode.h:84