tesseract  5.0.0-alpha-619-ge9db
lang_model_helpers.cpp
Go to the documentation of this file.
1 // Copyright 2017 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 // Purpose: Collection of convenience functions to simplify creation of the
4 // unicharset, recoder, and dawgs for an LSTM model.
5 
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 #include "lang_model_helpers.h"
16 
17 #if defined(_WIN32)
18 #include <direct.h>
19 #endif
20 #include <sys/stat.h>
21 #include <sys/types.h>
22 #include <cstdlib>
23 #include "dawg.h"
24 #include "fileio.h"
25 #include "tessdatamanager.h"
26 #include "trie.h"
27 #include "unicharcompress.h"
28 
29 namespace tesseract {
30 
31 // Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data
32 // to the file, using writer if not null, otherwise, a default writer.
33 // Default writer will overwrite any existing file, but a supplied writer
34 // can do its own thing. If lang is empty, returns true but does nothing.
35 // NOTE that suffix should contain any required . for the filename.
36 bool WriteFile(const std::string& output_dir, const std::string& lang,
37  const std::string& suffix, const GenericVector<char>& data,
38  FileWriter writer) {
39  if (lang.empty()) return true;
40  std::string dirname = output_dir + "/" + lang;
41  // Attempt to make the directory, but ignore errors, as it may not be a
42  // standard filesystem, and the writer will complain if not successful.
43 #if defined(_WIN32)
44  _mkdir(dirname.c_str());
45 #else
46  mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
47 #endif
48  std::string filename = dirname + "/" + lang + suffix;
49  if (writer == nullptr)
50  return SaveDataToFile(data, filename.c_str());
51  else
52  return (*writer)(data, filename.c_str());
53 }
54 
55 // Helper reads a file with optional reader and returns a STRING.
56 // On failure emits a warning message and returns and empty STRING.
57 STRING ReadFile(const std::string& filename, FileReader reader) {
58  if (filename.empty()) return STRING();
60  bool read_result;
61  if (reader == nullptr)
62  read_result = LoadDataFromFile(filename.c_str(), &data);
63  else
64  read_result = (*reader)(filename.c_str(), &data);
65  if (read_result) return STRING(&data[0], data.size());
66  tprintf("Failed to read data from: %s\n", filename.c_str());
67  return STRING();
68 }
69 
70 // Helper writes the unicharset to file and to the traineddata.
71 bool WriteUnicharset(const UNICHARSET& unicharset, const std::string& output_dir,
72  const std::string& lang, FileWriter writer,
73  TessdataManager* traineddata) {
74  GenericVector<char> unicharset_data;
75  TFile fp;
76  fp.OpenWrite(&unicharset_data);
77  if (!unicharset.save_to_file(&fp)) return false;
78  traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],
79  unicharset_data.size());
80  return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer);
81 }
82 
83 // Helper creates the recoder and writes it to the traineddata, and a human-
84 // readable form to file.
85 bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
86  const std::string& output_dir, const std::string& lang,
87  FileWriter writer, STRING* radical_table_data,
88  TessdataManager* traineddata) {
89  UnicharCompress recoder;
90  // Where the unicharset is carefully setup already to contain a good
91  // compact encoding, use a pass-through recoder that does nothing.
92  // For scripts that have a large number of unicodes (Han, Hangul) we want
93  // to use the recoder to compress the symbol space by re-encoding each
94  // unicode as multiple codes from a smaller 'alphabet' that are related to the
95  // shapes in the character. Hangul Jamo is a perfect example of this.
96  // See the Hangul Syllables section, sub-section "Equivalence" in:
97  // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf
98  if (pass_through) {
99  recoder.SetupPassThrough(unicharset);
100  } else {
101  int null_char =
102  unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size();
103  tprintf("Null char=%d\n", null_char);
104  if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {
105  tprintf("Creation of encoded unicharset failed!!\n");
106  return false;
107  }
108  }
109  TFile fp;
110  GenericVector<char> recoder_data;
111  fp.OpenWrite(&recoder_data);
112  if (!recoder.Serialize(&fp)) return false;
113  traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0],
114  recoder_data.size());
115  STRING encoding = recoder.GetEncodingAsString(unicharset);
116  recoder_data.init_to_size(encoding.length(), 0);
117  memcpy(&recoder_data[0], &encoding[0], encoding.length());
118  STRING suffix;
119  suffix.add_str_int(".charset_size=", recoder.code_range());
120  suffix += ".txt";
121  return WriteFile(output_dir, lang, suffix.c_str(), recoder_data, writer);
122 }
123 
124 // Helper builds a dawg from the given words, using the unicharset as coding,
125 // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
126 static bool WriteDawg(const GenericVector<STRING>& words,
127  const UNICHARSET& unicharset,
128  Trie::RTLReversePolicy reverse_policy,
129  TessdataType file_type, TessdataManager* traineddata) {
130  // The first 3 arguments are not used in this case.
131  Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0);
132  trie.add_word_list(words, unicharset, reverse_policy);
133  tprintf("Reducing Trie to SquishedDawg\n");
134  std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
135  if (dawg == nullptr || dawg->NumEdges() == 0) return false;
136  TFile fp;
137  GenericVector<char> dawg_data;
138  fp.OpenWrite(&dawg_data);
139  if (!dawg->write_squished_dawg(&fp)) return false;
140  traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
141  return true;
142 }
143 
144 // Builds and writes the dawgs, given a set of words, punctuation
145 // patterns, number patterns, to the traineddata. Encoding uses the given
146 // unicharset, and the punc dawgs is reversed if lang_is_rtl.
147 static bool WriteDawgs(const GenericVector<STRING>& words,
148  const GenericVector<STRING>& puncs,
149  const GenericVector<STRING>& numbers, bool lang_is_rtl,
150  const UNICHARSET& unicharset,
151  TessdataManager* traineddata) {
152  if (puncs.empty()) {
153  tprintf("Must have non-empty puncs list to use language models!!\n");
154  return false;
155  }
156  // For each of the dawg types, make the dawg, and write to traineddata.
157  // Dawgs are reversed as follows:
158  // Words: According to the word content.
159  // Puncs: According to lang_is_rtl.
160  // Numbers: Never.
161  // System dawg (main wordlist).
162  if (!words.empty() &&
163  !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
164  TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {
165  return false;
166  }
167  // punc/punc-dawg.
168  Trie::RTLReversePolicy reverse_policy =
170  if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG,
171  traineddata)) {
172  return false;
173  }
174  // numbers/number-dawg.
175  if (!numbers.empty() &&
176  !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,
177  TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {
178  return false;
179  }
180  return true;
181 }
182 
183 // The main function for combine_lang_model.cpp.
184 // Returns EXIT_SUCCESS or EXIT_FAILURE for error.
185 int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
186  const std::string& version_str, const std::string& output_dir,
187  const std::string& lang, bool pass_through_recoder,
188  const GenericVector<STRING>& words,
189  const GenericVector<STRING>& puncs,
190  const GenericVector<STRING>& numbers, bool lang_is_rtl,
191  FileReader reader, FileWriter writer) {
192  // Build the traineddata file.
193  TessdataManager traineddata;
194  if (!version_str.empty()) {
195  traineddata.SetVersionString(traineddata.VersionString() + ":" +
196  version_str);
197  }
198  // Unicharset and recoder.
199  if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
200  tprintf("Error writing unicharset!!\n");
201  return EXIT_FAILURE;
202  } else {
203  tprintf("Config file is optional, continuing...\n");
204  }
205  // If there is a config file, read it and add to traineddata.
206  std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
207  STRING config_file = ReadFile(config_filename, reader);
208  if (config_file.length() > 0) {
209  traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0],
210  config_file.length());
211  }
212  std::string radical_filename = script_dir + "/radical-stroke.txt";
213  STRING radical_data = ReadFile(radical_filename, reader);
214  if (radical_data.length() == 0) {
215  tprintf("Error reading radical code table %s\n", radical_filename.c_str());
216  return EXIT_FAILURE;
217  }
218  if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer,
219  &radical_data, &traineddata)) {
220  tprintf("Error writing recoder!!\n");
221  }
222  if (!words.empty() || !puncs.empty() || !numbers.empty()) {
223  if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset,
224  &traineddata)) {
225  tprintf("Error during conversion of wordlists to DAWGs!!\n");
226  return EXIT_FAILURE;
227  }
228  }
229 
230  // Traineddata file.
231  GenericVector<char> traineddata_data;
232  traineddata.Serialize(&traineddata_data);
233  if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) {
234  tprintf("Error writing output traineddata file!!\n");
235  return EXIT_FAILURE;
236  }
237  return EXIT_SUCCESS;
238 }
239 
240 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::WriteUnicharset
bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, const std::string &lang, FileWriter writer, TessdataManager *traineddata)
Definition: lang_model_helpers.cpp:71
tesseract::FileWriter
bool(*)(const GenericVector< char > &data, const char *filename) FileWriter
Definition: serialis.h:51
tesseract::CombineLangModel
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const GenericVector< STRING > &words, const GenericVector< STRING > &puncs, const GenericVector< STRING > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
Definition: lang_model_helpers.cpp:185
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
tesseract::TessdataManager::SetVersionString
void SetVersionString(const std::string &v_str)
Definition: tessdatamanager.cpp:239
tesseract::TessdataManager
Definition: tessdatamanager.h:126
tesseract::WriteRecoder
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, const std::string &lang, FileWriter writer, STRING *radical_table_data, TessdataManager *traineddata)
Definition: lang_model_helpers.cpp:85
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::LoadDataFromFile
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
Definition: genericvector.h:341
tesseract::TessdataManager::VersionString
std::string VersionString() const
Definition: tessdatamanager.cpp:233
STRING
Definition: strngs.h:45
tesseract::UnicharCompress::code_range
int code_range() const
Definition: unicharcompress.h:161
tesseract::FileReader
bool(*)(const char *filename, GenericVector< char > *data) FileReader
Definition: serialis.h:47
tesseract::SaveDataToFile
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
Definition: genericvector.h:362
tesseract::TESSDATA_LSTM_SYSTEM_DAWG
Definition: tessdatamanager.h:76
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
fileio.h
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
lang_model_helpers.h
tesseract::TessdataManager::OverwriteEntry
void OverwriteEntry(TessdataType type, const char *data, int size)
Definition: tessdatamanager.cpp:145
UNICHAR_BROKEN
Definition: unicharset.h:36
dawg.h
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
tesseract::Trie::RRP_DO_NO_REVERSE
Definition: trie.h:57
tesseract::TESSDATA_LSTM_NUMBER_DAWG
Definition: tessdatamanager.h:77
tesseract::TESSDATA_LSTM_RECODER
Definition: tessdatamanager.h:79
tesseract::TFile
Definition: serialis.h:75
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
tesseract::Trie::RTLReversePolicy
RTLReversePolicy
Definition: trie.h:56
tesseract
Definition: baseapi.h:65
tesseract::TessdataType
TessdataType
Definition: tessdatamanager.h:56
UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:712
GenericVector< char >
tesseract::TESSDATA_LANG_CONFIG
Definition: tessdatamanager.h:57
tesseract::TESSDATA_LSTM_PUNC_DAWG
Definition: tessdatamanager.h:75
tesseract::ReadFile
STRING ReadFile(const std::string &filename, FileReader reader)
Definition: lang_model_helpers.cpp:57
STRING::length
int32_t length() const
Definition: strngs.cpp:187
tesseract::WriteFile
bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, const GenericVector< char > &data, FileWriter writer)
Definition: lang_model_helpers.cpp:36
tesseract::UnicharCompress::ComputeEncoding
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
Definition: unicharcompress.cpp:101
unicharcompress.h
tesseract::UnicharCompress::SetupPassThrough
void SetupPassThrough(const UNICHARSET &unicharset)
Definition: unicharcompress.cpp:216
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::UnicharCompress::GetEncodingAsString
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
Definition: unicharcompress.cpp:319
tesseract::Trie::RRP_FORCE_REVERSE
Definition: trie.h:59
tesseract::TessdataManager::Serialize
void Serialize(GenericVector< char > *data) const
Definition: tessdatamanager.cpp:166
tesseract::UnicharCompress
Definition: unicharcompress.h:128
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::TFile::OpenWrite
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:309
tesseract::TESSDATA_LSTM_UNICHARSET
Definition: tessdatamanager.h:78
tessdatamanager.h
UNICHARSET::size
int size() const
Definition: unicharset.h:341
trie.h
tesseract::UnicharCompress::Serialize
bool Serialize(TFile *fp) const
Definition: unicharcompress.cpp:300