tesseract
5.0.0-alpha-619-ge9db
validate_khmer.cpp
Go to the documentation of this file.
1
#include "
validate_khmer.h
"
2
#include "
errcode.h
"
3
#include "
tprintf.h
"
4
5
namespace
tesseract
{
6
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from the Unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool
ValidateKhmer::ConsumeGraphemeIfValid
() {
21
const
unsigned
num_codes =
codes_
.size();
22
if
(
codes_used_
== num_codes)
return
false
;
23
if
(
codes_
[
codes_used_
].first ==
CharClass::kOther
) {
24
UseMultiCode
(1);
25
return
true
;
26
}
27
if
(
codes_
[
codes_used_
].first !=
CharClass::kConsonant
) {
28
if
(
report_errors_
) {
29
tprintf
(
"Invalid start of Khmer syllable:0x%x\n"
,
30
codes_
[
codes_used_
].second);
31
}
32
return
false
;
33
}
34
if
(
UseMultiCode
(1))
return
true
;
35
if
(
codes_
[
codes_used_
].first ==
CharClass::kRobat
||
36
codes_
[
codes_used_
].first ==
CharClass::kNukta
) {
37
if
(
UseMultiCode
(1))
return
true
;
38
}
39
while
(
codes_used_
+ 1 < num_codes &&
40
codes_
[
codes_used_
].first ==
CharClass::kVirama
&&
41
codes_
[
codes_used_
+ 1].first ==
CharClass::kConsonant
) {
42
ASSERT_HOST
(!
CodeOnlyToOutput
());
43
if
(
UseMultiCode
(2))
return
true
;
44
if
(
codes_
[
codes_used_
].first ==
CharClass::kRobat
) {
45
if
(
UseMultiCode
(1))
return
true
;
46
}
47
}
48
unsigned
num_matra_parts = 0;
49
if
(
codes_
[
codes_used_
].second ==
kZeroWidthJoiner
||
50
codes_
[
codes_used_
].second ==
kZeroWidthNonJoiner
) {
51
if
(
CodeOnlyToOutput
()) {
52
if
(
report_errors_
) {
53
tprintf
(
"Unterminated joiner: 0x%x\n"
,
output_
.back());
54
}
55
return
false
;
56
}
57
++num_matra_parts;
58
}
59
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
60
// own or as an addition to other matras.
61
if
(
codes_
[
codes_used_
].first ==
CharClass::kMatra
||
62
codes_
[
codes_used_
].first ==
CharClass::kMatraPiece
) {
63
++num_matra_parts;
64
if
(
UseMultiCode
(num_matra_parts))
return
true
;
65
}
else
if
(num_matra_parts) {
66
if
(
report_errors_
) {
67
tprintf
(
"Joiner with non-dependent vowel after it!:0x%x 0x%x\n"
,
68
output_
.back(),
codes_
[
codes_used_
].second);
69
}
70
return
false
;
71
}
72
if
(
codes_
[
codes_used_
].first ==
CharClass::kMatraPiece
&&
73
codes_
[
codes_used_
- 1].first !=
CharClass::kMatraPiece
) {
74
if
(
UseMultiCode
(1))
return
true
;
75
}
76
if
(
codes_
[
codes_used_
].first ==
CharClass::kVowelModifier
) {
77
if
(
UseMultiCode
(1))
return
true
;
78
}
79
if
(
codes_used_
+ 1 < num_codes &&
80
codes_
[
codes_used_
].first ==
CharClass::kVirama
&&
81
codes_
[
codes_used_
+ 1].first ==
CharClass::kConsonant
) {
82
ASSERT_HOST
(!
CodeOnlyToOutput
());
83
if
(
UseMultiCode
(2))
return
true
;
84
}
85
return
true
;
86
}
87
88
// Maps a single unicode value to the CharClass used by the Khmer validator.
// Vedic accents and the two zero-width joiners are recognised first; every
// other value is classified by its offset from script_, and values outside
// [0, kIndicCodePageSize) of that offset are reported as kOther.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;

  // Position of ch relative to the start of the script's code block.
  const int offset = ch - static_cast<char32>(script_);
  const bool in_block = offset >= 0 && offset < kIndicCodePageSize;
  // Anything belonging to a different code block is "other".
  if (!in_block) return CharClass::kOther;

  // Offsets that map to a class on their own. None of them is <= 0x45, so
  // handling them ahead of the range checks cannot change any result.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }

  // Remaining offsets are classified by range, lowest range first.
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  if (offset <= 0x51) return CharClass::kVowelModifier;
  return CharClass::kOther;
}
105
106
}
// namespace tesseract
tesseract::Validator::CharClass::kZeroWidthJoiner
validate_khmer.h
tesseract::Validator::CharClass::kVowelModifier
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition:
validator.cpp:191
tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition:
validator.h:210
ASSERT_HOST
#define ASSERT_HOST(x)
Definition:
errcode.h:87
tesseract::Validator::CharClass::kMatra
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition:
validator.h:246
tesseract::Validator::kIndicCodePageSize
static const int kIndicCodePageSize
Definition:
validator.h:228
tesseract::ValidateKhmer::UnicodeToCharClass
CharClass UnicodeToCharClass(char32 ch) const override
Definition:
validate_khmer.cpp:88
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition:
validator.h:110
tesseract::Validator::CharClass
CharClass
Definition:
validator.h:126
tesseract::Validator::output_
std::vector< char32 > output_
Definition:
validator.h:250
tesseract::char32
signed int char32
Definition:
unichar.h:53
tesseract::ValidateKhmer::ConsumeGraphemeIfValid
bool ConsumeGraphemeIfValid() override
Definition:
validate_khmer.cpp:20
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition:
validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition:
validator.h:256
tesseract
Definition:
baseapi.h:65
tesseract::Validator::CharClass::kVedicMark
tprintf.h
tesseract::Validator::codes_used_
unsigned codes_used_
Definition:
validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition:
validator.h:186
tesseract::Validator::CharClass::kConsonant
tesseract::Validator::CharClass::kNukta
tprintf
DLLSYM void tprintf(const char *format,...)
Definition:
tprintf.cpp:34
errcode.h
tesseract::Validator::CharClass::kRobat
tesseract::Validator::CharClass::kOther
tesseract::Validator::script_
ViramaScript script_
Definition:
validator.h:244
tesseract::Validator::CharClass::kZeroWidthNonJoiner
tesseract::Validator::CharClass::kMatraPiece
src
training
validate_khmer.cpp
Generated on Thu Jan 30 2020 14:22:21 for tesseract by
1.8.16