24 #include <unordered_map> 30 #include "unicode/normalizer2.h" 31 #include "unicode/translit.h" 32 #include "unicode/uchar.h" 33 #include "unicode/unorm2.h" 34 #include "unicode/uscript.h" 38 static bool is_hyphen_punc(
const char32 ch) {
// Membership test: reports true when ch equals an entry of the
// kHyphenPuncUnicodes table. The visible entries are ASCII '-' and
// U+2010 through U+2015.
// NOTE(review): the table is declared with 13 entries but only 7 are
// visible in this excerpt; the remaining entries and the fall-through
// return value are not shown here.
39 static const int kNumHyphenPuncUnicodes = 13;
40 static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
41 '-', 0x2010, 0x2011, 0x2012,
42 0x2013, 0x2014, 0x2015,
// Linear scan over the whole table; the first match ends the search.
50 for (
int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
51 if (kHyphenPuncUnicodes[i] == ch)
return true;
// Membership test: reports true when ch equals an entry of the
// kSingleQuoteUnicodes table (declared with 8 entries).
// NOTE(review): the table contents and the fall-through return value are
// not visible in this excerpt.
56 static bool is_single_quote(
const char32 ch) {
57 static const int kNumSingleQuoteUnicodes = 8;
58 static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
// Linear scan over the whole table; the first match ends the search.
68 for (
int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
69 if (kSingleQuoteUnicodes[i] == ch)
return true;
// Membership test: reports true when ch equals an entry of the
// kDoubleQuoteUnicodes table (declared with 8 entries).
// NOTE(review): the table contents and the fall-through return value are
// not visible in this excerpt.
74 static bool is_double_quote(
const char32 ch) {
75 static const int kNumDoubleQuoteUnicodes = 8;
76 static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
// Linear scan over the whole table; the first match ends the search.
87 for (
int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
88 if (kDoubleQuoteUnicodes[i] == ch)
return true;
97 std::vector<char32>* normed32) {
// NOTE(review): the start of this signature is not visible in this excerpt;
// the body matches the NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8,
// &normed32) calls made elsewhere in the file -- confirm.
// Decode str8 as UTF-8 into an ICU UnicodeString.
99 icu::UnicodeString uch_str(str8,
"UTF-8");
100 IcuErrorCode error_code;
// NOTE(review): the expressions that select norm_type and compose are not
// visible in this excerpt.
102 const char* norm_type =
106 UNormalization2Mode compose =
// Look up the ICU normalizer for the selected data name and mode; both the
// lookup and the normalization below assert success through error_code.
111 const icu::Normalizer2* normalizer =
112 icu::Normalizer2::getInstance(
nullptr, norm_type, compose, error_code);
113 error_code.assertSuccess();
115 icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
116 error_code.assertSuccess();
// Reserve using the UnicodeString length, then walk the normalized string
// one code point at a time (moveIndex32 advances the offset by one code
// point) and append each code point to *normed32.
118 normed32->reserve(norm_str.length());
119 for (
int offset = 0; offset < norm_str.length();
120 offset = norm_str.moveIndex32(offset, 1)) {
121 char32 ch = norm_str.char32At(offset);
// NOTE(review): original lines 122-124 are not visible; ch may be modified
// there before it is appended.
125 normed32->push_back(ch);
// Rewrites *str32 in place, but only when it contains no alphabetic code
// point: the first loop returns as soon as u_isalpha() is true for any
// element, before the rewrite loop below is reached.
130 static void StripJoiners(std::vector<char32>* str32) {
131 for (
char32 ch : *str32) {
132 if (u_isalpha(ch))
return;
// Second pass: kept code points are written back at index len, compacting
// the vector in place.
// NOTE(review): the declaration of len, the keep/drop condition and any
// final resize are not visible in this excerpt.
135 for (
char32 ch : *str32) {
138 (*str32)[len++] = ch;
149 std::string* normalized) {
// Visible pipeline: NormalizeUTF8ToUTF32 fills normed32 from str8, then
// StripJoiners post-processes normed32 in place.
150 std::vector<char32> normed32;
151 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
153 StripJoiners(&normed32);
154 std::vector<std::vector<char32>> graphemes;
// NOTE(review): the call that fills graphemes (original lines 155-156) and
// the bodies of both branches below are not visible in this excerpt.
// First branch: no graphemes at all, or an empty first grapheme.
157 if (graphemes.empty() || graphemes[0].empty()) {
159 }
// Second branch runs only when the caller supplied an output string.
else if (normalized !=
nullptr) {
175 std::vector<std::string>* graphemes) {
// Visible pipeline: NormalizeUTF8ToUTF32 fills normed32 from str8, then
// StripJoiners post-processes normed32 in place.
176 std::vector<char32> normed32;
177 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
178 StripJoiners(&normed32);
179 std::vector<std::vector<char32>> graphemes32;
// NOTE(review): the callee name and leading arguments of this call are not
// visible; presumably ValidateCleanAndSegment, whose last two parameters
// are (src, dest) -- confirm.
181 normed32, &graphemes32);
// Concatenate the segmented graphemes back into one flat sequence so it can
// be compared against the pre-segmentation input.
185 std::vector<char32> cleaned32;
186 for (
const auto& g : graphemes32) {
187 cleaned32.insert(cleaned32.end(), g.begin(), g.end());
// If segmentation changed the code point sequence, a second call is made
// with cleaned32 as the input and graphemes32 as the output.
// NOTE(review): the callee and what happens to the previous contents of
// graphemes32 are not visible in this excerpt.
189 if (cleaned32 != normed32) {
192 cleaned32, &graphemes32);
// Size the output once, then visit each UTF-32 grapheme.
// NOTE(review): the loop body (presumably a UTF-32 to UTF-8 conversion per
// grapheme) is not visible in this excerpt.
196 graphemes->reserve(graphemes32.size());
197 for (
const auto& grapheme : graphemes32) {
// Classifies ch with the three punctuation predicates, tested in the order
// hyphen, single quote, double quote; the first match wins.
// NOTE(review): the enclosing function (presumably OCRNormalize) and the
// statement executed by each branch are not visible in this excerpt.
205 if (is_hyphen_punc(ch))
207 else if (is_single_quote(ch))
209 else if (is_double_quote(ch))
// True when ch is below 0xD800 (compared as unsigned) or lies in
// 0xE000..0x10FFFF; this rejects 0xD800..0xDFFF (the UTF-16 surrogate
// range) and anything above 0x10FFFF.
// NOTE(review): the enclosing signature is not visible; presumably
// IsValidCodepoint -- confirm.
220 return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF);
// Forwards to ICU's u_isUWhiteSpace() after casting ch to UChar32.
// NOTE(review): the enclosing signature and any precondition check before
// this line are not visible; presumably IsWhitespace -- confirm.
226 return u_isUWhiteSpace(static_cast<UChar32>(ch));
// Adds it.utf8_len() for the current iterator position to the running
// total n_white.
// NOTE(review): the surrounding loop and its stop condition are not
// visible; presumably utf8_len() is the encoded byte length of the current
// code point -- confirm.
238 n_white += it.utf8_len();
// Adds it.utf8_len() for the current iterator position to the running
// total n_notwhite.
// NOTE(review): the surrounding loop and its stop condition are not
// visible; presumably utf8_len() is the encoded byte length of the current
// code point -- confirm.
248 n_notwhite += it.utf8_len();
// NOTE(review): the start of this boolean expression and the enclosing
// signature (presumably IsInterchangeValid) are not visible in this excerpt.
// Reject the noncharacter block U+FDD0..U+FDEF.
255 !(ch >= 0xFDD0 && ch <= 0xFDEF) &&
// Reject the last two code points (xxFFFE and xxFFFF) of every plane, from
// plane 0 (U+FFFE..U+FFFF) through plane 16 (U+10FFFE..U+10FFFF).
256 !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
257 !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
258 !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
259 !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
260 !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
261 !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
262 !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
263 !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
264 !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
265 !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
266 !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
267 !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
268 !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
269 !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
270 !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
271 !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
// Reject ISO control characters, except newline, form feed, tab and
// carriage return, which are allowed through.
272 (!u_isISOControl(static_cast<UChar32>(ch)) || ch ==
'\n' ||
273 ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
// NOTE(review): the start of this boolean expression and the enclosing
// signature (presumably IsInterchangeValid7BitAscii) are not visible here.
// Reject ISO control characters, except newline, form feed, tab and
// carriage return, which are allowed through.
278 (!u_isISOControl(static_cast<UChar32>(ch)) || ch ==
'\n' ||
279 ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
// NOTE(review): the condition enclosing this statement (original line 284)
// is not visible; inside it, every ch other than 0x3000 is returned
// unchanged.
285 if (ch != 0x3000)
return ch;
// Two code points are mapped directly rather than through the
// transliterator: 0xFF5F -> 0x2985 and 0xFF60 -> 0x2986.
288 if (ch == 0xFF5F)
return 0x2985;
289 if (ch == 0xFF60)
return 0x2986;
// Wrap ch in a single-code-point UnicodeString and build ICU's
// "Fullwidth-Halfwidth" transliterator in the forward direction.
// NOTE(review): the declaration of error_code is not visible here.
292 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
// NOTE(review): Transliterator::createInstance() returns an object owned by
// the caller; no release is visible in this excerpt -- confirm it is deleted
// in the lines that follow. A new instance is also created on every call.
293 const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
294 "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
295 error_code.assertSuccess();
// Transliterates uch_str in place.
// NOTE(review): how the result is read back and returned is not visible.
298 fulltohalf->transliterate(uch_str);
static const char32 kZeroWidthNonJoiner
static bool IsZeroWidthMark(char32 ch)
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
char32 OCRNormalize(char32 ch)
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
bool IsInterchangeValid(const char32 ch)
unsigned int SpanUTF8Whitespace(const char *text)
bool IsInterchangeValid7BitAscii(const char32 ch)
bool IsOCREquivalent(char32 ch1, char32 ch2)
bool IsWhitespace(const char32 ch)
bool IsUTF8Whitespace(const char *text)
bool IsValidCodepoint(const char32 ch)
static const_iterator begin(const char *utf8_str, const int byte_length)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
#define ASSERT_HOST_MSG(x,...)
char32 FullwidthToHalfwidth(const char32 ch)
static const_iterator end(const char *utf8_str, const int byte_length)
static const char32 kZeroWidthJoiner
unsigned int SpanUTF8NotWhitespace(const char *text)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)