tesseract  4.0.0-1-g2a2b
unicharset_extractor.cpp
Go to the documentation of this file.
1 // File: unicharset_extractor.cpp
3 // Description: Unicode character/ligature set extractor.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 // Given a list of box files or text files on the command line, this program
21 // normalizes the text according to command-line options and generates
22 // a unicharset.
23 
24 #include <cstdlib>
25 #include "boxread.h"
26 #include "commandlineflags.h"
27 #include "commontraining.h" // CheckSharedLibraryVersion
28 #include "genericvector.h"
29 #include "lang_model_helpers.h"
30 #include "normstrngs.h"
31 #include "strngs.h"
32 #include "unicharset.h"
34 
35 STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");
36 INT_PARAM_FLAG(norm_mode, 1,
37  "Normalization mode: 1=Combine graphemes, "
38  "2=Split graphemes, 3=Pure unicode");
39 
40 namespace tesseract {
41 
42 // Helper normalizes and segments the given strings according to norm_mode, and
43 // adds the segmented parts to unicharset.
44 static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
45  int norm_mode, UNICHARSET* unicharset) {
46  for (int i = 0; i < strings.size(); ++i) {
47  std::vector<std::string> normalized;
49  static_cast<GraphemeNormMode>(norm_mode),
50  /*report_errors*/ true,
51  strings[i].string(), &normalized)) {
52  for (const std::string& normed : normalized) {
53 
54  // normed is a UTF-8 encoded string
55  if (normed.empty() || IsUTF8Whitespace(normed.c_str())) continue;
56  unicharset->unichar_insert(normed.c_str());
57  }
58  } else {
59  tprintf("Normalization failed for string '%s'\n", strings[i].c_str());
60  }
61  }
62 }
63 
64 static int Main(int argc, char** argv) {
65  UNICHARSET unicharset;
66  // Load input files
67  for (int arg = 1; arg < argc; ++arg) {
68  STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
69  if (file_data.length() == 0) continue;
71  if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
72  /*continue_on_failure*/ false, /*boxes*/ nullptr,
73  &texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
74  tprintf("Extracting unicharset from box file %s\n", argv[arg]);
75  } else {
76  tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
77  texts.truncate(0);
78  file_data.split('\n', &texts);
79  }
80  AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
81  }
82  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false,
83  &unicharset);
84  // Write unicharset file.
85  if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
86  tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
87  } else {
88  tprintf("Cannot save unicharset file %s\n",
89  FLAGS_output_unicharset.c_str());
90  return EXIT_FAILURE;
91  }
92  return EXIT_SUCCESS;
93 }
94 
95 } // namespace tesseract
96 
97 int main(int argc, char** argv) {
98  tesseract::CheckSharedLibraryVersion();
99  if (argc > 1) {
100  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
101  }
102  if (argc < 2) {
103  tprintf(
104  "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
105  " box_or_text_file [...]\n",
106  argv[0]);
107  tprintf("Where mode means:\n");
108  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
109  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
110  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
111  tprintf("Reads box or plain text files to extract the unicharset.\n");
112  return EXIT_FAILURE;
113  }
114  return tesseract::Main(argc, argv);
115 }
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
int size() const
Definition: genericvector.h:71
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
INT_PARAM_FLAG(norm_mode, 1, "Normalization mode: 1=Combine graphemes, " "2=Split graphemes, 3=Pure unicode")
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path")
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:284
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45
STRING ReadFile(const std::string &filename, FileReader reader)
int main(int argc, char **argv)
void truncate(int size)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:172
int32_t length() const
Definition: strngs.cpp:191
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:68