tesseract  4.0.0-1-g2a2b
validate_khmer.cpp
Go to the documentation of this file.
1 #include "validate_khmer.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4 
5 namespace tesseract {
6 
7 // Returns whether codes matches the pattern for a Khmer Grapheme.
8 // Taken from unicode standard:
9 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
10 // where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
11 // to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
12 // Translated to the codes used by the CharClass enum:
13 // C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
14 // Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
15 // Also the Consonant class here includes independent vowels, as they are
16 // treated the same anyway.
17 // In the split grapheme mode, the only characters that get grouped are the
18 // HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners in
19 // the BNF syntax, so who knows what they do.
// NOTE(review): the function signature (listing line 20) is absent from this
// extract; the symbol index at the end of the page lists
// `bool ConsumeGraphemeIfValid() override`, which is presumably this body.
//
// Tries to consume one grapheme starting at codes_[codes_used_]. Returns false
// when there is nothing left to consume or the codes do not form a valid
// grapheme (optionally logging via tprintf when report_errors_ is set), and
// true otherwise.
21  int num_codes = codes_.size();
// Nothing left to consume.
22  if (codes_used_ == num_codes) return false;
// A code classed as kOther is consumed on its own and accepted as-is.
23  if (codes_[codes_used_].first == CharClass::kOther) {
24  UseMultiCode(1);
25  return true;
26  }
// NOTE(review): listing line 27, which opens the branch below, is absent from
// this extract. The branch rejects the current code as an invalid syllable
// start; presumably the test is "not a consonant" - confirm.
28  if (report_errors_) {
29  tprintf("Invalid start of Khmer syllable:0x%x\n",
30  codes_[codes_used_].second);
31  }
32  return false;
33  }
// Consume the starting code. Throughout this function a true result from
// UseMultiCode causes an immediate successful return.
// NOTE(review): presumably true means the input is exhausted - confirm against
// validator.h.
34  if (UseMultiCode(1)) return true;
// Optional single code directly after the start: a Robat or (listing line 36,
// absent from this extract) some alternative class; the header BNF suggests
// {R | N} - confirm.
35  if (codes_[codes_used_].first == CharClass::kRobat ||
37  if (UseMultiCode(1)) return true;
38  }
// Repeatedly consume two-code groups while at least two codes remain.
// NOTE(review): listing lines 40-42 (the rest of the loop condition and the
// first body statement) are absent from this extract; the header BNF suggests
// this loop matches {HC {R}}* - confirm.
39  while (codes_used_ + 1 < num_codes &&
43  if (UseMultiCode(2)) return true;
// Each two-code group may be followed by a single Robat.
44  if (codes_[codes_used_].first == CharClass::kRobat) {
45  if (UseMultiCode(1)) return true;
46  }
47  }
// Counts the codes (joiner and/or matra) that are consumed together below as
// one unit.
48  int num_matra_parts = 0;
// A zero-width joiner (the alternative on listing line 50 is absent from this
// extract) is passed through with CodeOnlyToOutput(); a true result from that
// call is reported as an unterminated joiner and fails the grapheme.
49  if (codes_[codes_used_].second == kZeroWidthJoiner ||
51  if (CodeOnlyToOutput()) {
52  if (report_errors_) {
53  tprintf("Unterminated joiner: 0x%x\n", output_.back());
54  }
55  return false;
56  }
57  ++num_matra_parts;
58  }
59  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
60  // own or as an addition to other matras.
// NOTE(review): listing line 62 (second half of this condition) is absent from
// this extract.
61  if (codes_[codes_used_].first == CharClass::kMatra ||
63  ++num_matra_parts;
// Consumes the matra together with any joiner counted above.
64  if (UseMultiCode(num_matra_parts)) return true;
65  } else if (num_matra_parts) {
// A joiner was seen but no matra followed it: invalid.
66  if (report_errors_) {
67  tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
68  output_.back(), codes_[codes_used_].second);
69  }
70  return false;
71  }
// NOTE(review): listing lines 72-73 and 76 (the conditions guarding the two
// optional single-code consumptions below) are absent from this extract.
74  if (UseMultiCode(1)) return true;
75  }
77  if (UseMultiCode(1)) return true;
78  }
// Optional trailing two-code group, attempted only when at least two codes
// remain.
// NOTE(review): listing lines 80-82 (rest of the condition) are absent from
// this extract; the header BNF suggests a final {HC} - confirm.
79  if (codes_used_ + 1 < num_codes &&
83  if (UseMultiCode(2)) return true;
84  }
// A valid grapheme was consumed and more codes remain.
85  return true;
86 }
87 
// NOTE(review): the function signature (listing line 88) is absent from this
// extract; the symbol index at the end of the page lists
// `CharClass UnicodeToCharClass(char32 ch) const override`, which is
// presumably this body.
//
// Maps a unicode code point to the CharClass used by the grapheme validator,
// based on its offset from the start of the script's code block.
89  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
// NOTE(review): listing lines 90-91 are absent from this extract.
92  // Offset from the start of the relevant unicode code block aka code page.
93  int off = ch - static_cast<char32>(script_);
94  // Anything in another code block is other.
95  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
// The tests below are order-dependent: the two range tests use only an upper
// bound, so each one catches whatever the earlier tests let through.
// Offsets 0x00-0x33: consonants (per the file's header comment this class
// also covers independent vowels).
96  if (off <= 0x33) return CharClass::kConsonant;
// Offsets 0x34-0x45: matras.
97  if (off <= 0x45) return CharClass::kMatra;
98  if (off == 0x46) return CharClass::kMatraPiece;
// The single offsets 0x4c and 0x49/0x4a must be tested before the <= 0x51
// catch-all below, otherwise they would be classed as vowel modifiers.
99  if (off == 0x4c) return CharClass::kRobat;
// kNukta is repurposed here as the consonant shifter (see the header comment).
100  if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
// Remaining offsets up to 0x51 (0x47, 0x48, 0x4b, 0x4d-0x51).
101  if (off <= 0x51) return CharClass::kVowelModifier;
102  if (off == 0x52) return CharClass::kVirama;
// Everything from 0x53 up to the end of the code page.
103  return CharClass::kOther;
104 }
105 
106 } // namespace tesseract
std::vector< IndicPair > codes_
Definition: validator.h:232
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
signed int char32
Definition: unichar.h:52
static const int kIndicCodePageSize
Definition: validator.h:214
ViramaScript script_
Definition: validator.h:230
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool UseMultiCode(int length)
Definition: validator.h:196
static const char32 kZeroWidthJoiner
Definition: validator.h:97
std::vector< char32 > output_
Definition: validator.h:236
bool ConsumeGraphemeIfValid() override
CharClass UnicodeToCharClass(char32 ch) const override
#define ASSERT_HOST(x)
Definition: errcode.h:84