24 #include <unordered_map> 
   30 #include "unicode/normalizer2.h"   
   31 #include "unicode/translit.h"      
   32 #include "unicode/uchar.h"         
   33 #include "unicode/unorm2.h"        
   34 #include "unicode/uscript.h"       
   // Reports whether `ch` matches an entry of the function-local table
   // kHyphenPuncUnicodes.
   //
   // ch: the code point to look up.
   // Returns true as soon as a table entry compares equal to `ch`.
   // NOTE(review): the not-found path and the closing braces are on lines that
   // are not part of this listing; presumably the function falls through to
   // `return false` -- confirm against the full file.
   38 static bool is_hyphen_punc(
const char32 ch) {
 
   // Declared size of the lookup table below.
   39   static const int kNumHyphenPuncUnicodes = 13;
 
   // Only the first seven entries ('-' and 0x2010..0x2015) are visible here;
   // the remaining entries of the 13 declared are on lines missing from this
   // listing.
   40   static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
 
   41       '-',    0x2010, 0x2011, 0x2012,
 
   42       0x2013, 0x2014, 0x2015,  
 
   // Linear scan over the table; the first match ends the search.
   50   for (
int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
 
   51     if (kHyphenPuncUnicodes[i] == ch) 
return true;
 
   // Reports whether `ch` matches an entry of the function-local table
   // kSingleQuoteUnicodes.
   //
   // ch: the code point to look up.
   // Returns true as soon as a table entry compares equal to `ch`.
   // NOTE(review): the table contents, the not-found path and the closing
   // braces are on lines that are not part of this listing -- confirm against
   // the full file.
   56 static bool is_single_quote(
const char32 ch) {
 
   // Declared size of the lookup table below.
   57   static const int kNumSingleQuoteUnicodes = 8;
 
   58   static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
 
   // Linear scan over the table; the first match ends the search.
   68   for (
int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
 
   69     if (kSingleQuoteUnicodes[i] == ch) 
return true;
 
   // Reports whether `ch` matches an entry of the function-local table
   // kDoubleQuoteUnicodes.
   //
   // ch: the code point to look up.
   // Returns true as soon as a table entry compares equal to `ch`.
   // NOTE(review): the table contents, the not-found path and the closing
   // braces are on lines that are not part of this listing -- confirm against
   // the full file.
   74 static bool is_double_quote(
const char32 ch) {
 
   // Declared size of the lookup table below.
   75   static const int kNumDoubleQuoteUnicodes = 8;
 
   76   static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
 
   // Linear scan over the table; the first match ends the search.
   87   for (
int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
 
   88     if (kDoubleQuoteUnicodes[i] == ch) 
return true;
 
   97                                  std::vector<char32>* normed32) {
 
   99   icu::UnicodeString uch_str(str8, 
"UTF-8");
 
  100   IcuErrorCode error_code;
 
  102   const char* norm_type =
 
  106   UNormalization2Mode compose =
 
  111   const icu::Normalizer2* normalizer =
 
  112       icu::Normalizer2::getInstance(
nullptr, norm_type, compose, error_code);
 
  113   error_code.assertSuccess();
 
  115   icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
 
  116   error_code.assertSuccess();
 
  118   normed32->reserve(norm_str.length());  
 
  119   for (
int offset = 0; offset < norm_str.length();
 
  120        offset = norm_str.moveIndex32(offset, 1)) {
 
  121     char32 ch = norm_str.char32At(offset);
 
  125     normed32->push_back(ch);
 
  // Rewrites *str32 in place, but only when it contains no alphabetic
  // character.
  //
  // str32: vector of code points, read and overwritten through the pointer.
  // NOTE(review): the declaration of `len`, the condition that decides which
  // code points are kept, and the end of the function are on lines missing
  // from this listing. Going by the name, joiner code points are presumably
  // the ones dropped and the vector is presumably shrunk to `len` afterwards
  // -- confirm against the full file.
  130 static void StripJoiners(std::vector<char32>* str32) {
 
  // First pass: if any element is alphabetic according to u_isalpha, return
  // immediately and leave the vector untouched.
  131   for (
char32 ch : *str32) {
 
  132     if (u_isalpha(ch)) 
return;
 
  // Second pass: copy elements back into the same vector at index `len`,
  // advancing `len` after each write.
  135   for (
char32 ch : *str32) {
 
  138       (*str32)[len++] = ch;
 
  150   std::vector<char32> normed32;
 
  151   NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
 
  153     StripJoiners(&normed32);
 
  154     std::vector<std::vector<char32>> graphemes;
 
  157     if (graphemes.empty() || graphemes[0].empty()) {
 
  159     } 
else if (normalized != 
nullptr) {
 
  175                                   std::vector<std::string>* graphemes) {
 
  176   std::vector<char32> normed32;
 
  177   NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
 
  178   StripJoiners(&normed32);
 
  179   std::vector<std::vector<char32>> graphemes32;
 
  181                                                     normed32, &graphemes32);
 
  185     std::vector<char32> cleaned32;
 
  186     for (
const auto& g : graphemes32) {
 
  187       cleaned32.insert(cleaned32.end(), g.begin(), g.end());
 
  189     if (cleaned32 != normed32) {
 
  192                                                    cleaned32, &graphemes32);
 
  196   graphemes->reserve(graphemes32.size());
 
  197   for (
const auto& grapheme : graphemes32) {
 
  205   if (is_hyphen_punc(ch))
 
  207   else if (is_single_quote(ch))
 
  209   else if (is_double_quote(ch))
 
  220   return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF);
 
  226   return u_isUWhiteSpace(static_cast<UChar32>(ch));
 
  238     n_white += it.utf8_len();
 
  248     n_notwhite += it.utf8_len();
 
  255          !(ch >= 0xFDD0 && ch <= 0xFDEF) &&  
 
  256          !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
 
  257          !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
 
  258          !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
 
  259          !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
 
  260          !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
 
  261          !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
 
  262          !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
 
  263          !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
 
  264          !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
 
  265          !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
 
  266          !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
 
  267          !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
 
  268          !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
 
  269          !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
 
  270          !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
 
  271          !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
 
  272          (!u_isISOControl(static_cast<UChar32>(ch)) || ch == 
'\n' ||
 
  273           ch == 
'\f' || ch == 
'\t' || ch == 
'\r');
 
  278          (!u_isISOControl(static_cast<UChar32>(ch)) || ch == 
'\n' ||
 
  279           ch == 
'\f' || ch == 
'\t' || ch == 
'\r');
 
  285     if (ch != 0x3000) 
return ch;
 
  288   if (ch == 0xFF5F) 
return 0x2985;
 
  289   if (ch == 0xFF60) 
return 0x2986;
 
  291   IcuErrorCode error_code;
 
  292   icu::UnicodeString uch_str(static_cast<UChar32>(ch));
 
  293   const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
 
  294       "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
 
  295   error_code.assertSuccess();
 
  298   fulltohalf->transliterate(uch_str);