tesseract  4.0.0-1-g2a2b
validator.cpp
Go to the documentation of this file.
1 #include "validator.h"
2 
3 #include <algorithm>
4 #include <unordered_map>
5 #include <vector>
6 #include <iterator>
7 
8 #include "icuerrorcode.h"
9 #include "unicode/uchar.h" // From libicu
10 #include "unicode/uscript.h" // From libicu
11 #include "validate_grapheme.h"
12 #include "validate_indic.h"
13 #include "validate_javanese.h"
14 #include "validate_khmer.h"
15 #include "validate_myanmar.h"
16 
17 namespace tesseract {
18 
19 // Some specific but universally useful unicodes.
20 const char32 Validator::kZeroWidthSpace = 0x200B;
22 const char32 Validator::kZeroWidthJoiner = 0x200D;
23 const char32 Validator::kLeftToRightMark = 0x200E;
24 const char32 Validator::kRightToLeftMark = 0x200F;
25 const char32 Validator::kInvalid = 0xfffd;
26 
27 // Destructor, defaulted out of line.
28 // It is defined in this file, rather than in the header, so the compiler can
29 // create a single vtable instead of weak vtables in every compilation unit.
30 Validator::~Validator() = default;
31 
32 // Validates and cleans the src vector of unicodes to the *dest, according to
33 // g_mode. In the case of kSingleString, a single vector containing the whole
34 // result is added to *dest. With kCombined, multiple vectors are added to
35 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
36 // added to *dest with a smaller unit representing a glyph in each.
37 // In case of validation error, returns false and as much as possible of the
38 // input, without discarding invalid text.
39 /* static */
// NOTE(review): the line carrying the return type and qualified name (listing
// line 40) is absent from this copy. The symbol index at the end of the file
// records it as `static bool ValidateCleanAndSegment(...)` - restore it before
// building.
41  GraphemeNormMode g_mode, bool report_errors, const std::vector<char32>& src,
42  std::vector<std::vector<char32>>* dest) {
// Generic grapheme validator; used directly for non-virama text and as the
// first pass for virama scripts.
43  ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
44  std::vector<std::vector<char32>> graphemes;
// NOTE(review): `script` is read below but its declaration (listing line 45)
// is absent here. Presumably it is obtained from MostFrequentViramaScript(src)
// - confirm against the upstream file.
46  bool success = true;
47  if (script == ViramaScript::kNonVirama) {
48  // The grapheme segmenter's maximum segmentation is the grapheme unit, so
49  // up the mode by 1 to get the desired effect.
// NOTE(review): the statements guarded by the two conditions below (listing
// lines 51 and 53) are absent. Per the comment above they are expected to
// advance g_mode by one level each - confirm the exact enumerators.
50  if (g_mode == GraphemeNormMode::kCombined)
52  else if (g_mode == GraphemeNormMode::kGlyphSplit)
54  // Just do grapheme segmentation.
55  success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
56  } else {
// Two passes: first split src into `graphemes` with the generic validator,
// then run each of those through the script-specific validator, which appends
// its output to dest.
57  success = g_validator.ValidateCleanAndSegmentInternal(
58  GraphemeNormMode::kGlyphSplit, src, &graphemes);
59  std::unique_ptr<Validator> validator(
60  ScriptValidator(script, report_errors));
61  for (const auto& grapheme : graphemes) {
// A failure is recorded but does not stop the loop, so the remaining graphemes
// are still processed.
62  if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
63  success = false;
64  }
65  }
66  }
67  return success;
68 }
69 
70 // Factory method that understands how to map script to the right subclass.
71 std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
72  bool report_errors) {
73  switch (script) {
75  return std::unique_ptr<Validator>(
76  new ValidateGrapheme(script, report_errors));
78  return std::unique_ptr<Validator>(
79  new ValidateJavanese(script, report_errors));
81  return std::unique_ptr<Validator>(
82  new ValidateMyanmar(script, report_errors));
84  return std::unique_ptr<Validator>(
85  new ValidateKhmer(script, report_errors));
86  default:
87  return std::unique_ptr<Validator>(
88  new ValidateIndic(script, report_errors));
89  }
90 }
91 
92 // Internal version of the public static ValidateCleanAndSegment.
93 // Validates and cleans the src vector of unicodes to the *dest, according to
94 // its type and the given g_mode.
95 // In case of validation error, returns false and returns as much as possible
96 // of the input, without discarding invalid text.
98  GraphemeNormMode g_mode, const std::vector<char32>& src,
99  std::vector<std::vector<char32>>* dest) {
100  Clear();
101  ComputeClassCodes(src);
102  bool success = true;
103  for (codes_used_ = 0; codes_used_ < codes_.size();) {
104  if (!ConsumeGraphemeIfValid()) {
105  success = false;
106  ++codes_used_;
107  }
108  }
109  MoveResultsToDest(g_mode, dest);
110  return success;
111 }
112 
113 // Moves the results from parts_ or output_ to dest according to g_mode.
// NOTE(review): the first line of this definition (listing line 114) is absent
// from this copy. The symbol index at the end of the file records the
// signature as
// `void MoveResultsToDest(GraphemeNormMode g_mode, std::vector<std::vector<char32>>* dest)`.
115  std::vector<std::vector<char32>>* dest) {
// NOTE(review): the opening `if (...) {` of the chain below (listing line 116)
// is absent. The first branch is the one that emits one single-element vector
// per code point - confirm which GraphemeNormMode enumerator selects it.
117  // Append each element of the combined output_ that we made as a new vector
118  // in dest.
119  dest->reserve(dest->size() + output_.size());
120  for (char32 ch : output_) dest->push_back({ch});
121  } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
122  // Append all the parts_ that we made onto dest.
// std::move (the algorithm) leaves the elements of parts_ in a moved-from
// state.
123  std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
// The dest->empty() clause sends the remaining mode here too when dest has no
// element yet for the final branch to append to.
124  } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
125  // Append the combined output_ that we made onto dest as one new vector.
126  dest->push_back(std::vector<char32>());
// Swapping with the freshly pushed empty vector hands over output_'s contents
// without copying and leaves output_ empty.
127  output_.swap(dest->back());
128  } else { // kNone.
129  // Append the combined output_ that we made onto the last existing element
130  // of dest.
131  dest->back().insert(dest->back().end(), output_.begin(), output_.end());
132  }
133 }
134 
// Ordering predicate for (int, int) pairs that looks only at the second
// member: returns true when p1's second is strictly smaller than p2's. The
// first member does not take part in the comparison.
static bool CmpPairSecond(const std::pair<int, int>& p1,
                          const std::pair<int, int>& p2) {
  const int lhs_value = p1.second;
  const int rhs_value = p2.second;
  return rhs_value > lhs_value;
}
139 
140 // Computes and returns the ViramaScript corresponding to the most frequent
141 // virama-using script in the input, or kNonVirama if none are present.
142 /* static */
144  const std::vector<char32>& utf32) {
145  std::unordered_map<int, int> histogram;
146  for (char32 ch : utf32) {
147  // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
148  // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
149  // unicode code space, so use its script id.
150  int base = ch / kIndicCodePageSize;
151  IcuErrorCode err;
152  UScriptCode script_code = uscript_getScript(ch, err);
153  if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
154  script_code != USCRIPT_COMMON) ||
155  script_code == USCRIPT_MYANMAR) {
156  if (script_code == USCRIPT_MYANMAR)
157  base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
158  ++histogram[base];
159  }
160  }
161  if (!histogram.empty()) {
162  int base =
163  std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
164  ->first;
165  char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
166  // Check for validity.
167  if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
168  codebase == static_cast<char32>(ViramaScript::kJavanese) ||
169  codebase == static_cast<char32>(ViramaScript::kKhmer) ||
170  (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
171  codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
172  return static_cast<ViramaScript>(codebase);
173  }
174  }
176 }
177 
178 // Returns true if the given UTF-32 unicode is a "virama" character.
179 /* static */
181  return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
182  (unicode & 0x7f) == 0x4d) ||
183  unicode == kSinhalaVirama ||
184  unicode == kJavaneseVirama ||
185  unicode == kMyanmarVirama ||
186  unicode == kKhmerVirama;
187 }
188 
189 // Returns true if the given UTF-32 unicode is a vedic accent.
190 /* static */
192  return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193  (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194  (0x951 <= unicode && unicode <= 0x954);
195 }
196 
197 // Returns true if the script is one that uses subscripts for conjuncts.
// NOTE(review): the first line of this definition (listing line 198) is absent
// from this copy. The symbol index at the end of the file records it as
// `bool IsSubscriptScript() const`.
199  return script_ == ViramaScript::kTelugu ||
// NOTE(review): the rest of this expression (listing lines 200-203) is absent,
// so the scripts tested after kTelugu cannot be determined from this copy -
// restore them from the upstream file.
204 }
205 
206 void Validator::ComputeClassCodes(const std::vector<char32>& text) {
207  codes_.reserve(text.size());
208  for (char32 c : text) {
209  codes_.push_back(std::make_pair(UnicodeToCharClass(c), c));
210  }
211 }
212 
213 // Resets to the initial state.
215  codes_.clear();
216  parts_.clear();
217  output_.clear();
218  codes_used_ = 0;
219  output_used_ = 0;
220 }
221 
222 } // namespace tesseract
std::vector< IndicPair > codes_
Definition: validator.h:232
signed int char32
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
static const char32 kInvalid
Definition: validator.h:100
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
bool IsSubscriptScript() const
Definition: validator.cpp:198
static const char32 kMinIndicUnicode
Definition: validator.h:216
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:114
static const char32 kMaxJavaneseUnicode
Definition: validator.h:227
signed int char32
Definition: unichar.h:52
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:40
static const int kIndicCodePageSize
Definition: validator.h:214
static const char32 kMyanmarVirama
Definition: validator.h:223
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:97
static const char32 kJavaneseVirama
Definition: validator.h:226
static const char32 kMaxSinhalaUnicode
Definition: validator.h:218
virtual CharClass UnicodeToCharClass(char32 ch) const =0
ViramaScript script_
Definition: validator.h:230
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
static const char32 kKhmerVirama
Definition: validator.h:224
GraphemeNormMode
Definition: validator.h:34
static const char32 kZeroWidthSpace
Definition: validator.h:95
static const char32 kLeftToRightMark
Definition: validator.h:98
static const char32 kZeroWidthJoiner
Definition: validator.h:97
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:206
virtual bool ConsumeGraphemeIfValid()=0
std::vector< char32 > output_
Definition: validator.h:236
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:71
std::vector< std::vector< char32 > > parts_
Definition: validator.h:234
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:143
static const char32 kSinhalaVirama
Definition: validator.h:222
static const char32 kRightToLeftMark
Definition: validator.h:99