37 "Normalization mode: 1=Combine graphemes, " 38 "2=Split graphemes, 3=Pure unicode");
46 for (
int i = 0; i < strings.
size(); ++i) {
47 std::vector<std::string> normalized;
49 static_cast<GraphemeNormMode>(norm_mode),
51 strings[i].
string(), &normalized)) {
52 for (
const std::string& normed : normalized) {
59 tprintf(
"Normalization failed for string '%s'\n", strings[i].c_str());
64 static int Main(
int argc,
char** argv) {
67 for (
int arg = 1; arg < argc; ++arg) {
69 if (file_data.
length() == 0)
continue;
73 &texts,
nullptr,
nullptr)) {
74 tprintf(
"Extracting unicharset from box file %s\n", argv[arg]);
76 tprintf(
"Extracting unicharset from plain text file %s\n", argv[arg]);
78 file_data.
split(
'\n', &texts);
80 AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
85 if (unicharset.
save_to_file(FLAGS_output_unicharset.c_str())) {
86 tprintf(
"Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
88 tprintf(
"Cannot save unicharset file %s\n",
89 FLAGS_output_unicharset.c_str());
97 int main(
int argc,
char** argv) {
98 tesseract::CheckSharedLibraryVersion();
104 "Usage: %s [--output_unicharset filename] [--norm_mode mode]" 105 " box_or_text_file [...]\n",
107 tprintf(
"Where mode means:\n");
108 tprintf(
" 1=combine graphemes (use for Latin and other simple scripts)\n");
109 tprintf(
" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
110 tprintf(
" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
111 tprintf(
"Reads box or plain text files to extract the unicharset.\n");
114 return tesseract::Main(argc, argv);
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
bool save_to_file(const char *const filename) const
INT_PARAM_FLAG(norm_mode, 1, "Normalization mode: 1=Combine graphemes, " "2=Split graphemes, 3=Pure unicode")
bool IsUTF8Whitespace(const char *text)
STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path")
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
void split(const char c, GenericVector< STRING > *splited)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
DLLSYM void tprintf(const char *format,...)
STRING ReadFile(const std::string &filename, FileReader reader)
int main(int argc, char **argv)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)