33 #include "unicode/uchar.h" 34 #include "unicode/uscript.h" 42 for (
int unichar_id = 0; unichar_id < unicharset->
size(); ++unichar_id) {
44 const char* unichar_str = unicharset->
id_to_unichar(unichar_id);
57 bool unichar_isalpha =
false;
58 bool unichar_islower =
false;
59 bool unichar_isupper =
false;
60 bool unichar_isdigit =
false;
61 bool unichar_ispunct =
false;
63 for (
char32 u_ch : uni_vector) {
64 if (u_isalpha(u_ch)) unichar_isalpha =
true;
65 if (u_islower(u_ch)) unichar_islower =
true;
66 if (u_isupper(u_ch)) unichar_isupper =
true;
67 if (u_isdigit(u_ch)) unichar_isdigit =
true;
68 if (u_ispunct(u_ch)) unichar_ispunct =
true;
71 unicharset->
set_isalpha(unichar_id, unichar_isalpha);
72 unicharset->
set_islower(unichar_id, unichar_islower);
73 unicharset->
set_isupper(unichar_id, unichar_isupper);
74 unicharset->
set_isdigit(unichar_id, unichar_isdigit);
78 unicharset->
set_script(unichar_id, uscript_getName(
79 uscript_getScript(uni_vector[0], err)));
81 const int num_code_points = uni_vector.size();
84 if (unichar_islower || unichar_isupper) {
85 std::vector<char32> other_case(num_code_points, 0);
86 for (
int i = 0; i < num_code_points; ++i) {
91 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
92 u_tolower(uni_vector[i]);
97 if (other_case_id != INVALID_UNICHAR_ID) {
100 tprintf(
"Other case %s of %s is not in unicharset\n",
101 other_case_uch.c_str(), unichar_str);
106 std::vector<char32> mirrors(num_code_points, 0);
107 for (
int i = 0; i < num_code_points; ++i) {
108 mirrors[i] = u_charMirror(uni_vector[i]);
111 static_cast<UNICHARSET::Direction>(
112 u_charDirection(uni_vector[i])));
117 if (mirror_uch_id != INVALID_UNICHAR_ID) {
118 unicharset->
set_mirror(unichar_id, mirror_uch_id);
119 }
else if (report_errors) {
120 tprintf(
"Mirror %s of %s is not in unicharset\n",
121 mirror_uch.c_str(), unichar_str);
125 std::string normed_str;
126 if (unichar_id != 0 &&
131 unichar_str, &normed_str) &&
132 !normed_str.empty()) {
133 unicharset->
set_normed(unichar_id, normed_str.c_str());
135 unicharset->
set_normed(unichar_id, unichar_str);
146 std::string filename = script_dir +
"/" +
152 tprintf(
"Failed to load script unicharset from:%s\n", filename.c_str());
157 tprintf(
"Warning: properties incomplete for index %d = %s\n", c,
166 std::string xheights_str;
169 std::string filename = script_dir +
"/" +
171 std::string script_heights;
173 xheights_str += script_heights;
184 const std::string& input_unicharset_file,
185 const std::string& output_unicharset_file,
186 const std::string& output_xheights_file) {
191 tprintf(
"Loaded unicharset of size %d from file %s\n", unicharset.
size(),
192 input_unicharset_file.c_str());
195 tprintf(
"Setting unichar properties\n");
197 tprintf(
"Setting script properties\n");
199 if (!output_xheights_file.empty()) {
205 tprintf(
"Writing unicharset to file %s\n", output_unicharset_file.c_str());
206 unicharset.
save_to_file(output_unicharset_file.c_str());
void set_isalpha(UNICHAR_ID unichar_id, bool value)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset)
bool save_to_file(const char *const filename) const
const char * get_script_from_script_id(int id) const
static TESS_API const char * kCustomLigatures[][2]
static bool ReadFileToString(const std::string &filename, std::string *out)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
int get_script_table_size() const
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void SetPropertiesFromOther(const UNICHARSET &src)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
DLLSYM void tprintf(const char *format,...)
void set_isupper(UNICHAR_ID unichar_id, bool value)
void SetPropertiesForInputFile(const std::string &script_dir, const std::string &input_unicharset_file, const std::string &output_unicharset_file, const std::string &output_xheights_file)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
void set_normed(UNICHAR_ID unichar_id, const char *normed)
void set_islower(UNICHAR_ID unichar_id, bool value)
const char * id_to_unichar(UNICHAR_ID id) const
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
bool load_from_file(const char *const filename, bool skip_fragments)
void set_script(UNICHAR_ID unichar_id, const char *value)