4 #include <unordered_map> 9 #include "unicode/uchar.h" 10 #include "unicode/uscript.h" 42 std::vector<std::vector<char32>>* dest) {
44 std::vector<std::vector<char32>> graphemes;
59 std::unique_ptr<Validator> validator(
61 for (
const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
75 return std::unique_ptr<Validator>(
78 return std::unique_ptr<Validator>(
81 return std::unique_ptr<Validator>(
84 return std::unique_ptr<Validator>(
87 return std::unique_ptr<Validator>(
99 std::vector<std::vector<char32>>* dest) {
115 std::vector<std::vector<char32>>* dest) {
119 dest->reserve(dest->size() +
output_.size());
123 std::move(
parts_.begin(),
parts_.end(), std::back_inserter(*dest));
126 dest->push_back(std::vector<char32>());
131 dest->back().insert(dest->back().end(),
output_.begin(),
output_.end());
135 static bool CmpPairSecond(
const std::pair<int, int>& p1,
136 const std::pair<int, int>& p2) {
137 return p1.second < p2.second;
144 const std::vector<char32>& utf32) {
145 std::unordered_map<int, int> histogram;
152 UScriptCode script_code = uscript_getScript(ch, err);
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
161 if (!histogram.empty()) {
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
182 (unicode & 0x7f) == 0x4d) ||
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
207 codes_.reserve(text.size());
std::vector< IndicPair > codes_
static bool IsVedicAccent(char32 unicode)
static const char32 kInvalid
static const char32 kZeroWidthNonJoiner
bool IsSubscriptScript() const
static const char32 kMinIndicUnicode
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
static const char32 kMaxJavaneseUnicode
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static const int kIndicCodePageSize
static const char32 kMyanmarVirama
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static const char32 kJavaneseVirama
static const char32 kMaxSinhalaUnicode
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static bool IsVirama(char32 unicode)
static const char32 kKhmerVirama
static const char32 kZeroWidthSpace
static const char32 kLeftToRightMark
static const char32 kZeroWidthJoiner
void ComputeClassCodes(const std::vector< char32 > &text)
virtual bool ConsumeGraphemeIfValid()=0
std::vector< char32 > output_
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
std::vector< std::vector< char32 > > parts_
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kSinhalaVirama
static const char32 kRightToLeftMark