21 #include <sys/types.h> 36 bool WriteFile(
const std::string& output_dir,
const std::string& lang,
39 if (lang.empty())
return true;
40 std::string dirname = output_dir +
"/" + lang;
44 _mkdir(dirname.c_str());
46 mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
48 std::string filename = dirname +
"/" + lang + suffix;
49 if (writer ==
nullptr)
52 return (*writer)(data, filename.c_str());
58 if (filename.empty())
return STRING();
61 if (reader ==
nullptr)
64 read_result = (*reader)(filename.c_str(), &data);
65 if (read_result)
return STRING(&data[0], data.
size());
66 tprintf(
"Failed to read data from: %s\n", filename.c_str());
79 unicharset_data.
size());
80 return WriteFile(output_dir, lang,
".unicharset", unicharset_data, writer);
86 const std::string& output_dir,
const std::string& lang,
103 tprintf(
"Null char=%d\n", null_char);
104 if (!recoder.
ComputeEncoding(unicharset, null_char, radical_table_data)) {
105 tprintf(
"Creation of encoded unicharset failed!!\n");
112 if (!recoder.
Serialize(&fp))
return false;
114 recoder_data.
size());
117 memcpy(&recoder_data[0], &encoding[0], encoding.
length());
121 return WriteFile(output_dir, lang, suffix.string(), recoder_data, writer);
132 trie.add_word_list(words, unicharset, reverse_policy);
133 tprintf(
"Reducing Trie to SquishedDawg\n");
134 std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
135 if (dawg ==
nullptr || dawg->NumEdges() == 0)
return false;
138 fp.OpenWrite(&dawg_data);
139 if (!dawg->write_squished_dawg(&fp))
return false;
140 traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.
size());
151 TessdataManager* traineddata) {
153 tprintf(
"Must have non-empty puncs list to use language models!!\n");
162 if (!words.
empty() &&
175 if (!numbers.
empty() &&
186 const std::string& version_str,
const std::string& output_dir,
187 const std::string& lang,
bool pass_through_recoder,
194 if (!version_str.empty()) {
199 if (!
WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
200 tprintf(
"Error writing unicharset!!\n");
203 tprintf(
"Config file is optional, continuing...\n");
206 std::string config_filename = script_dir +
"/" + lang +
"/" + lang +
".config";
208 if (config_file.
length() > 0) {
212 std::string radical_filename = script_dir +
"/radical-stroke.txt";
214 if (radical_data.
length() == 0) {
215 tprintf(
"Error reading radical code table %s\n", radical_filename.c_str());
218 if (!
WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer,
219 &radical_data, &traineddata)) {
220 tprintf(
"Error writing recoder!!\n");
223 if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset,
225 tprintf(
"Error during conversion of wordlists to DAWGs!!\n");
232 traineddata.
Serialize(&traineddata_data);
233 if (!
WriteFile(output_dir, lang,
".traineddata", traineddata_data, writer)) {
234 tprintf(
"Error writing output traineddata file!!\n");
void SetVersionString(const std::string &v_str)
void OpenWrite(GenericVector< char > *data)
bool save_to_file(const char *const filename) const
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
void OverwriteEntry(TessdataType type, const char *data, int size)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const GenericVector< STRING > &words, const GenericVector< STRING > &puncs, const GenericVector< STRING > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
void init_to_size(int size, const T &t)
DLLSYM void tprintf(const char *format,...)
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
void SetupPassThrough(const UNICHARSET &unicharset)
bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, const GenericVector< char > &data, FileWriter writer)
void add_str_int(const char *str, int number)
bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, const std::string &lang, FileWriter writer, TessdataManager *traineddata)
STRING ReadFile(const std::string &filename, FileReader reader)
void Serialize(GenericVector< char > *data) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
std::string VersionString() const
bool has_special_codes() const
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, const std::string &lang, FileWriter writer, STRING *radical_table_data, TessdataManager *traineddata)
bool Serialize(TFile *fp) const
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)