37 static int RadicalPreHash(
const std::vector<int>& rs) {
39 for (
int radical : rs) {
47 using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
51 static bool DecodeRadicalLine(
STRING* radical_data_line,
RSMap* radical_map) {
52 if (radical_data_line->
length() == 0 || (*radical_data_line)[0] ==
'#')
55 radical_data_line->
split(
' ', &entries);
56 if (entries.
size() < 2)
return false;
58 int unicode = strtol(&entries[0][0], &end, 10);
59 if (*end !=
'\0')
return false;
60 std::unique_ptr<std::vector<int>> radicals(
new std::vector<int>);
61 for (
int i = 1; i < entries.
size(); ++i) {
62 int radical = strtol(&entries[i][0], &end, 10);
63 if (*end !=
'\0')
return false;
64 radicals->push_back(radical);
66 (*radical_map)[unicode] = std::move(radicals);
74 static bool DecodeRadicalTable(
STRING* radical_data,
RSMap* radical_map) {
76 radical_data->
split(
'\n', &lines);
77 for (
int i = 0; i < lines.
size(); ++i) {
78 if (!DecodeRadicalLine(&lines[i], radical_map)) {
79 tprintf(
"Invalid format in radical table at line %d: %s\n", i,
92 encoder_ = src.encoder_;
93 code_range_ = src.code_range_;
103 STRING* radical_stroke_table) {
105 if (radical_stroke_table !=
nullptr &&
106 !DecodeRadicalTable(radical_stroke_table, &radical_map))
121 int hangul_offset = unicharset.
size();
127 int han_offset = hangul_offset + kTotalJamos;
128 for (
int u = 0; u <= unicharset.
size(); ++u) {
131 if (u == unicharset.
size() && u != null_id)
break;
134 std::vector<char32> unicodes;
136 if (u < unicharset.
size())
138 if (u < unicharset.
size() &&
141 int unicode = unicodes[0];
142 int leading, vowel, trailing;
143 auto it = radical_map.find(unicode);
144 if (it != radical_map.end()) {
146 int num_radicals = it->second->size();
147 for (
int c = 0; c < num_radicals; ++c) {
148 code.
Set(c, han_offset + (*it->second)[c]);
150 int pre_hash = RadicalPreHash(*it->second);
151 int num_samples = radical_counts[pre_hash]++;
157 code.
Set3(leading + hangul_offset, vowel +
kLCount + hangul_offset,
172 for (
int i = 0; i < unicodes.size(); ++i) {
173 int position = code.
length();
175 tprintf(
"Unichar %d=%s is too long to encode!!\n", u,
179 int uni = unicodes[i];
186 if (direct_set.
size() >
189 tprintf(
"Code space expanded from original unicharset!!\n");
195 encoder_.push_back(code);
202 for (
int u = 0; u < unicharset.
size(); ++u) {
204 if (code->
length() <= i)
continue;
205 max_offset = std::max(max_offset, (*code)(i)-han_offset);
206 code->
Set(i, (*code)(i) + code_offset);
208 if (max_offset == 0)
break;
209 code_offset += max_offset + 1;
211 DefragmentCodeValues(null_id >= 0 ? 1 : -1);
220 for (
int u = 0; u < unicharset.
size(); ++u) {
227 code.
Set(0, unicharset.
size());
242 void UnicharCompress::DefragmentCodeValues(
int encoded_null) {
250 for (
int c = 0; c < encoder_.size(); ++c) {
252 for (
int i = 0; i < code.
length(); ++i) {
253 offsets[code(i)] = 1;
258 for (
int i = 0; i < offsets.
size(); ++i) {
261 if (offsets[i] == 0 || i == encoded_null) {
267 if (encoded_null >= 0) {
270 offsets[encoded_null] = offsets.
size() + offsets.
back() - encoded_null;
273 for (
int c = 0; c < encoder_.size(); ++c) {
274 RecodedCharID* code = &encoder_[c];
275 for (
int i = 0; i < code->length(); ++i) {
276 int value = (*code)(i);
277 code->Set(i, value + offsets[value]);
286 if (unichar_id < 0 || unichar_id >= encoder_.size())
return 0;
287 *code = encoder_[unichar_id];
296 auto it = decoder_.find(code);
297 if (it == decoder_.end())
return INVALID_UNICHAR_ID;
303 return encoder_.SerializeClasses(fp);
308 if (!encoder_.DeSerializeClasses(fp))
return false;
324 for (
int c = 0; c < encoder_.size(); ++c) {
331 for (
int i = 1; i < code.
length(); ++i) {
356 *leading = offset / kNCount;
357 *vowel = (offset % kNCount) /
kTCount;
363 void UnicharCompress::ComputeCodeRange() {
365 for (
int c = 0; c < encoder_.size(); ++c) {
367 for (
int i = 0; i < code.
length(); ++i) {
368 if (code(i) > code_range_) code_range_ = code(i);
375 void UnicharCompress::SetupDecoder() {
378 for (
int c = 0; c < encoder_.size(); ++c) {
379 const RecodedCharID& code = encoder_[c];
381 is_valid_start_[code(0)] =
true;
382 RecodedCharID prefix = code;
383 int len = code.
length() - 1;
384 prefix.Truncate(len);
385 auto final_it = final_codes_.find(prefix);
386 if (final_it == final_codes_.end()) {
389 final_codes_[prefix] = code_list;
391 prefix.Truncate(len);
392 auto next_it = next_codes_.find(prefix);
393 if (next_it == next_codes_.end()) {
396 next_codes_[prefix] = code_list;
400 if (!next_it->second->contains(code(len)))
406 if (!final_it->second->contains(code(len)))
407 final_it->second->push_back(code(len));
413 void UnicharCompress::Cleanup() {
415 is_valid_start_.
clear();
416 for (
auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
419 for (
auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
423 final_codes_.clear();
static const int kMaxCodeLen
int DecodeUnichar(const RecodedCharID &code) const
static const int kNumHangul
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
void SetupDirect(const GenericVector< RecodedCharID > &codes)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
void Set(int index, int value)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
UnicharCompress & operator=(const UnicharCompress &src)
static const int kFirstHangul
void split(const char c, GenericVector< STRING > *splited)
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
bool contains_unichar(const char *const unichar_repr) const
void init_to_size(int size, const T &t)
DLLSYM void tprintf(const char *format,...)
std::unordered_map< int, int > RSCounts
void SetupPassThrough(const UNICHARSET &unicharset)
static std::string CleanupString(const char *utf8_str)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
void add_str_int(const char *str, int number)
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
const char * id_to_unichar(UNICHAR_ID id) const
std::unordered_map< int, std::unique_ptr< std::vector< int > >> RSMap
bool DeSerialize(TFile *fp)
bool has_special_codes() const
bool Serialize(TFile *fp) const
void Set3(int code0, int code1, int code2)