tesseract
5.0.0-alpha-619-ge9db
|
Go to the documentation of this file.
36 "Normalization mode: 1=Combine graphemes, "
37 "2=Split graphemes, 3=Pure unicode");
45 for (
int i = 0; i < strings.
size(); ++i) {
46 std::vector<std::string> normalized;
48 static_cast<GraphemeNormMode>(norm_mode),
50 strings[i].c_str(), &normalized)) {
58 tprintf(
"Normalization failed for string '%s'\n", strings[i].c_str());
63 static int Main(
int argc,
char** argv) {
66 for (
int arg = 1; arg < argc; ++arg) {
68 if (file_data.
length() == 0)
continue;
72 &texts,
nullptr,
nullptr)) {
73 tprintf(
"Extracting unicharset from box file %s\n", argv[arg]);
75 tprintf(
"Extracting unicharset from plain text file %s\n", argv[arg]);
77 file_data.
split(
'\n', &texts);
79 AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
84 if (unicharset.
save_to_file(FLAGS_output_unicharset.c_str())) {
85 tprintf(
"Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
87 tprintf(
"Cannot save unicharset file %s\n",
88 FLAGS_output_unicharset.c_str());
96 int main(
int argc,
char** argv) {
97 tesseract::CheckSharedLibraryVersion();
103 "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
104 " box_or_text_file [...]\n",
106 tprintf(
"Where mode means:\n");
107 tprintf(
" 1=combine graphemes (use for Latin and other simple scripts)\n");
108 tprintf(
" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
109 tprintf(
" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
110 tprintf(
"Reads box or plain text files to extract the unicharset.\n");
113 return tesseract::Main(argc, argv);
#define INT_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
bool save_to_file(const char *const filename) const
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
bool IsUTF8Whitespace(const char *text)
STRING ReadFile(const std::string &filename, FileReader reader)
DLLSYM void tprintf(const char *format,...)
int main(int argc, char **argv)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
void split(char c, GenericVector< STRING > *splited)