29 static const char* kNullChar =
"<nul>";
36 static int RadicalPreHash(
const std::vector<int>& rs) {
38 for (
int radical : rs) {
46 using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
50 static bool DecodeRadicalLine(
STRING* radical_data_line,
RSMap* radical_map) {
51 if (radical_data_line->
length() == 0 || (*radical_data_line)[0] ==
'#')
54 radical_data_line->
split(
' ', &entries);
55 if (entries.
size() < 2)
return false;
57 int unicode = strtol(&entries[0][0], &end, 10);
58 if (*end !=
'\0')
return false;
59 std::unique_ptr<std::vector<int>> radicals(
new std::vector<int>);
60 for (
int i = 1; i < entries.
size(); ++i) {
61 int radical = strtol(&entries[i][0], &end, 10);
62 if (*end !=
'\0')
return false;
63 radicals->push_back(radical);
65 (*radical_map)[unicode] = std::move(radicals);
73 static bool DecodeRadicalTable(
STRING* radical_data,
RSMap* radical_map) {
75 radical_data->
split(
'\n', &lines);
76 for (
int i = 0; i < lines.
size(); ++i) {
77 if (!DecodeRadicalLine(&lines[i], radical_map)) {
78 tprintf(
"Invalid format in radical table at line %d: %s\n", i,
91 encoder_ = src.encoder_;
92 code_range_ = src.code_range_;
102 STRING* radical_stroke_table) {
104 if (radical_stroke_table !=
nullptr &&
105 !DecodeRadicalTable(radical_stroke_table, &radical_map))
120 int hangul_offset = unicharset.
size();
126 int han_offset = hangul_offset + kTotalJamos;
127 for (
int u = 0; u <= unicharset.
size(); ++u) {
130 if (u == unicharset.
size() && u != null_id)
break;
133 std::vector<char32> unicodes;
135 if (u < unicharset.
size())
137 if (u < unicharset.
size() &&
140 int unicode = unicodes[0];
141 int leading, vowel, trailing;
142 auto it = radical_map.find(unicode);
143 if (it != radical_map.end()) {
145 int num_radicals = it->second->size();
146 for (
int c = 0; c < num_radicals; ++c) {
147 code.
Set(c, han_offset + (*it->second)[c]);
149 int pre_hash = RadicalPreHash(*it->second);
150 int num_samples = radical_counts[pre_hash]++;
156 code.
Set3(leading + hangul_offset, vowel +
kLCount + hangul_offset,
171 for (
int uni : unicodes) {
172 int position = code.
length();
174 tprintf(
"Unichar %d=%s is too long to encode!!\n", u,
184 if (direct_set.
size() >
187 tprintf(
"Code space expanded from original unicharset!!\n");
193 encoder_.push_back(code);
200 for (
int u = 0; u < unicharset.
size(); ++u) {
202 if (code->
length() <= i)
continue;
203 max_offset = std::max(max_offset, (*code)(i)-han_offset);
204 code->
Set(i, (*code)(i) + code_offset);
206 if (max_offset == 0)
break;
207 code_offset += max_offset + 1;
209 DefragmentCodeValues(null_id >= 0 ? 1 : -1);
218 for (
int u = 0; u < unicharset.
size(); ++u) {
225 code.
Set(0, unicharset.
size());
240 void UnicharCompress::DefragmentCodeValues(
int encoded_null) {
248 for (
int c = 0; c < encoder_.size(); ++c) {
250 for (
int i = 0; i < code.
length(); ++i) {
251 offsets[code(i)] = 1;
256 for (
int i = 0; i < offsets.
size(); ++i) {
259 if (offsets[i] == 0 || i == encoded_null) {
265 if (encoded_null >= 0) {
268 offsets[encoded_null] = offsets.
size() + offsets.
back() - encoded_null;
271 for (
int c = 0; c < encoder_.size(); ++c) {
272 RecodedCharID* code = &encoder_[c];
273 for (
int i = 0; i < code->length(); ++i) {
274 int value = (*code)(i);
275 code->Set(i, value + offsets[value]);
284 if (unichar_id < 0 || unichar_id >= encoder_.size())
return 0;
285 *code = encoder_[unichar_id];
294 auto it = decoder_.find(code);
295 if (it == decoder_.end())
return INVALID_UNICHAR_ID;
301 return encoder_.SerializeClasses(fp);
306 if (!encoder_.DeSerializeClasses(fp))
return false;
322 for (
int c = 0; c < encoder_.size(); ++c) {
329 for (
int i = 1; i < code.
length(); ++i) {
335 encoding += kNullChar;
354 *leading = offset / kNCount;
355 *vowel = (offset % kNCount) /
kTCount;
361 void UnicharCompress::ComputeCodeRange() {
363 for (
int c = 0; c < encoder_.size(); ++c) {
365 for (
int i = 0; i < code.
length(); ++i) {
366 if (code(i) > code_range_) code_range_ = code(i);
373 void UnicharCompress::SetupDecoder() {
376 for (
int c = 0; c < encoder_.size(); ++c) {
377 const RecodedCharID& code = encoder_[c];
379 is_valid_start_[code(0)] =
true;
380 RecodedCharID prefix = code;
381 int len = code.length() - 1;
382 prefix.Truncate(len);
383 auto final_it = final_codes_.find(prefix);
384 if (final_it == final_codes_.end()) {
387 final_codes_[prefix] = code_list;
389 prefix.Truncate(len);
390 auto next_it = next_codes_.find(prefix);
391 if (next_it == next_codes_.end()) {
394 next_codes_[prefix] = code_list;
398 if (!next_it->second->contains(code(len)))
399 next_it->second->push_back(code(len));
404 if (!final_it->second->contains(code(len)))
405 final_it->second->push_back(code(len));
411 void UnicharCompress::Cleanup() {
413 is_valid_start_.
clear();
414 for (
auto& next_code : next_codes_) {
415 delete next_code.second;
417 for (
auto& final_code : final_codes_) {
418 delete final_code.second;
421 final_codes_.clear();