33 #include "unicode/uchar.h"
34 #include "unicode/uscript.h"
// NOTE(review): this listing is an extract with fused source line numbers
// and missing lines; the enclosing function header is not visible here.
// Presumably this is the body of SetupBasicProperties(report_errors,
// unicharset) -- confirm against the full file.
//
// Visits every unichar id in `unicharset` and fills in its properties.
41 for (
int unichar_id = 0; unichar_id < unicharset->
size(); ++unichar_id) {
// String form of the current id, as returned by the unicharset.
43 const char* unichar_str = unicharset->
id_to_unichar(unichar_id);
// Character-class flags. Each starts false and is only ever switched to
// true by the loop below.
57 bool unichar_isalpha =
false;
58 bool unichar_islower =
false;
59 bool unichar_isupper =
false;
60 bool unichar_isdigit =
false;
61 bool unichar_ispunct =
false;
// A flag becomes true as soon as ANY element of uni_vector satisfies the
// matching ICU predicate; the tests are independent, so several flags may
// end up set for one unichar.
// NOTE(review): uni_vector is declared in lines not shown here; presumably
// it holds the code points of unichar_str -- confirm.
63 for (
int i = 0; i < uni_vector.
size(); ++i) {
64 if (u_isalpha(uni_vector[i]))
65 unichar_isalpha =
true;
66 if (u_islower(uni_vector[i]))
67 unichar_islower =
true;
68 if (u_isupper(uni_vector[i]))
69 unichar_isupper =
true;
70 if (u_isdigit(uni_vector[i]))
71 unichar_isdigit =
true;
72 if (u_ispunct(uni_vector[i]))
73 unichar_ispunct =
true;
// Store the computed flags on the unicharset entry.
// NOTE(review): unichar_ispunct is computed above but no setter call for it
// is visible in this extract (there is a gap after source line 79) --
// confirm that it is stored in the full file.
76 unicharset->
set_isalpha(unichar_id, unichar_isalpha);
77 unicharset->
set_islower(unichar_id, unichar_islower);
78 unicharset->
set_isupper(unichar_id, unichar_isupper);
79 unicharset->
set_isdigit(unichar_id, unichar_isdigit);
// The script name is derived from the FIRST element of uni_vector only.
// NOTE(review): indexing [0] assumes uni_vector is non-empty, and `err` is
// declared in lines not shown -- confirm both.
83 unicharset->
set_script(unichar_id, uscript_getName(
84 uscript_getScript(uni_vector[0], err)));
86 const int num_code_points = uni_vector.
size();
// Only unichars flagged lower or upper get an other-case lookup.
89 if (unichar_islower || unichar_isupper) {
91 for (
int i = 0; i < num_code_points; ++i) {
// The direction of the conversion is chosen once for the whole unichar:
// if unichar_islower is set every element goes through u_toupper,
// otherwise every element goes through u_tolower.
96 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
97 u_tolower(uni_vector[i]);
// NOTE(review): the statements between this test and the message below are
// missing from the extract; presumably the message belongs to the branch
// where the other-case string has no id -- confirm.
103 if (other_case_id != INVALID_UNICHAR_ID) {
106 tprintf(
"Other case %s of %s is not in unicharset\n",
107 other_case_uch.
c_str(), unichar_str);
// Per-element mirror character via u_charMirror. The static_cast below is
// the tail of a call whose head is missing from the extract; it converts the
// result of u_charDirection to UNICHARSET::Direction (presumably the
// argument of set_direction -- confirm).
113 for (
int i = 0; i < num_code_points; ++i) {
114 mirrors[i] = u_charMirror(uni_vector[i]);
117 static_cast<UNICHARSET::Direction>(
118 u_charDirection(uni_vector[i])));
// Record the mirror id when it is valid; otherwise print a message, but
// only when report_errors is set.
124 if (mirror_uch_id != INVALID_UNICHAR_ID) {
125 unicharset->
set_mirror(unichar_id, mirror_uch_id);
126 }
else if (report_errors) {
127 tprintf(
"Mirror %s of %s is not in unicharset\n",
128 mirror_uch.
c_str(), unichar_str);
// Normalized-form handling: the test excludes id 0 and empty normed_str.
// NOTE(review): the lines between this test and the set_normed call below
// are missing; the visible call stores unichar_str itself as the normed
// form, presumably in the fallback branch -- confirm.
133 if (unichar_id != 0 && normed_str.
length() > 0) {
136 unicharset->
set_normed(unichar_id, unichar_str);
// NOTE(review): tail of a parameter list whose function-name line is not
// visible in this extract; presumably SetPropertiesForInputFile(script_dir,
// input_unicharset_file, output_unicharset_file, output_xheights_file) --
// confirm. Many body lines are also missing below.
149 const string& input_unicharset_file,
150 const string& output_unicharset_file,
151 const string& output_xheights_file) {
// Progress message giving the unicharset size and the input file name.
156 tprintf(
"Loaded unicharset of size %d from file %s\n", unicharset.
size(),
157 input_unicharset_file.c_str());
160 tprintf(
"Setting unichar properties\n");
// Builds a path under script_dir; the rest of the expression is missing
// from this extract.
165 string filename = script_dir +
"/" +
// script_heights is appended to xheights_str; how it is filled is not
// visible here.
174 string script_heights;
176 xheights_str += script_heights;
// Guarded by a non-empty output_xheights_file; the guarded statement is not
// visible in this extract.
178 if (!output_xheights_file.empty())
// Warning printed with an index and a string; the arguments and the
// condition that triggers it are not visible in this extract.
182 tprintf(
"Warning: properties incomplete for index %d = %s\n",
// Save the unicharset to output_unicharset_file.
// NOTE(review): no use of save_to_file's result is visible here -- confirm
// whether the full file checks it.
188 tprintf(
"Writing unicharset to file %s\n", output_unicharset_file.c_str());
189 unicharset.
save_to_file(output_unicharset_file.c_str());
void set_isupper(UNICHAR_ID unichar_id, bool value)
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
static bool ReadFileToString(const string &filename, string *out)
bool save_to_file(const char *const filename) const
void set_islower(UNICHAR_ID unichar_id, bool value)
void SetupBasicProperties(bool report_errors, UNICHARSET *unicharset)
bool load_from_file(const char *const filename, bool skip_fragments)
int get_script_table_size() const
static void WriteStringToFileOrDie(const string &str, const string &filename)
void set_normed(UNICHAR_ID unichar_id, const char *normed)
const char *const id_to_unichar(UNICHAR_ID id) const
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
void set_isalpha(UNICHAR_ID unichar_id, bool value)
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
const char * get_script_from_script_id(int id) const
STRING NormalizeUTF8String(const char *str8)
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
void set_script(UNICHAR_ID unichar_id, const char *value)
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void SetPropertiesFromOther(const UNICHARSET &src)
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
static const char * kCustomLigatures[][2]
void SetPropertiesForInputFile(const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file)
const char * c_str() const