// Edge-count budget. GetEdges sets *edge_cnt to dawg_cnt * max_edge_ when it
// is called without a parent edge (tess_lm_edge == NULL) and to max_edge_ in
// its other visible branch, each time just before testing edge_array for NULL.
// NOTE(review): presumably the capacity used to size the edge array -- confirm.
42 int TessLangModel::max_edge_ = 4096;
// The DAWG_OOD constant reinterpreted as a Dawg pointer. FanOut compares its
// dawg argument against this same cast and routes a match to OODEdges, and
// GetEdges passes ood_dawg_ to FanOut when building root edges.
// NOTE(review): looks like a tag value rather than a real Dawg object, so it
// should never be dereferenced -- confirm against the DAWG_OOD definition.
45 const Dawg *TessLangModel::ood_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_OOD);
// The DAWG_NUMBER constant reinterpreted as a Dawg pointer. FanOut compares
// its dawg argument against this same cast and routes a match to NumberEdges;
// NumberEdges in turn passes number_dawg_ to Edges for the edges it creates.
// NOTE(review): same tag-value assumption as ood_dawg_ -- confirm.
46 const Dawg *TessLangModel::number_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_NUMBER);
// Per-state repeat limit, indexed by number-state. NumberEdges sets
// new_repeat_cnt to repeat_cnt + 1 when num_state_machine_ keeps the same
// state, and tests new_repeat_cnt > num_max_repeat_[state].
// NOTE(review): the action taken when the limit is exceeded is not visible
// here; the four initializers assume kStateCnt == 4 -- confirm.
55 const int TessLangModel::num_max_repeat_[
kStateCnt] = {3, 32, 8, 3};
61 const string &data_file_path,
62 bool load_system_dawg,
68 LoadLangModelElements(lm_params);
72 if (load_system_dawg &&
78 cntxt_->
Lang().c_str(),
87 void TessLangModel::FreeEdges(
int edge_cnt,
LangModEdge **edge_array) {
88 if (edge_array !=
NULL) {
89 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
90 if (edge_array[edge_idx] !=
NULL) {
91 delete edge_array[edge_idx];
105 LangModEdge **final_edge) {
108 LangModEdge **edge_array =
GetEdges(
NULL, edge, &edge_cnt);
111 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
113 if (sequence[0] == edge_array[edge_idx]->EdgeString()[0]) {
115 if (sequence[1] == 0) {
117 if (eow_flag ==
false || edge_array[edge_idx]->IsEOW()) {
118 if (final_edge !=
NULL) {
119 (*final_edge) = edge_array[edge_idx];
120 edge_array[edge_idx] =
NULL;
123 FreeEdges(edge_cnt, edge_array);
129 final_edge) ==
true) {
130 FreeEdges(edge_cnt, edge_array);
137 FreeEdges(edge_cnt, edge_array);
147 if (final_edge !=
NULL) {
148 (*final_edge) =
NULL;
155 return lead_punc_.find(ch) != string::npos;
159 return trail_punc_.find(ch) != string::npos;
163 return digits_.find(ch) != string::npos;
179 if (tess_lm_edge ==
NULL) {
181 int dawg_cnt = NumDawgs();
183 (*edge_cnt) = dawg_cnt * max_edge_;
185 if (edge_array ==
NULL) {
189 for (
int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
190 const Dawg *curr_dawg = GetDawg(dawg_idx);
194 (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0,
NULL,
true,
195 edge_array + (*edge_cnt));
199 (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0,
NULL,
true,
200 edge_array + (*edge_cnt));
204 (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0,
NULL,
true,
205 edge_array + (*edge_cnt));
208 for (
int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
209 edge_array[edge_idx]->
SetRoot(
true);
213 (*edge_cnt) = max_edge_;
216 if (edge_array ==
NULL) {
221 (*edge_cnt) = FanOut(alt_list,
224 tess_lm_edge->
EdgeString(),
false, edge_array);
231 int TessLangModel::Edges(
const char *strng,
const Dawg *dawg,
237 for (edge_idx = 0; strng[edge_idx] != 0; edge_idx++) {
239 if (class_id != INVALID_UNICHAR_ID) {
243 if (edge_array[edge_cnt] ==
NULL) {
247 reinterpret_cast<TessLangModEdge *
>(edge_array[edge_cnt])->
248 SetEdgeMask(edge_mask);
257 int TessLangModel::OODEdges(CharAltList *alt_list,
EDGE_REF edge_ref,
258 EDGE_REF edge_ref_mask, LangModEdge **edge_array) {
261 for (
int class_id = 0; class_id < class_cnt; class_id++) {
263 if ((alt_list ==
NULL ||
264 alt_list->ClassCost(class_id) <= max_ood_shape_cost_)) {
266 edge_array[edge_cnt] =
new TessLangModEdge(cntxt_, class_id);
267 if (edge_array[edge_cnt] ==
NULL) {
279 int TessLangModel::FanOut(CharAltList *alt_list,
const Dawg *dawg,
281 const char_32 *str,
bool root_flag,
282 LangModEdge **edge_array) {
287 if (dawg == reinterpret_cast<Dawg *>(
DAWG_OOD)) {
289 return OODEdges(alt_list, edge_ref, edge_mask, edge_array);
293 }
else if (dawg == reinterpret_cast<Dawg *>(
DAWG_NUMBER)) {
296 return NumberEdges(edge_ref, edge_array);
304 return Edges(trail_punc_.c_str(), dawg, edge_ref,
309 }
else if (root_flag ==
true || edge_ref == 0) {
320 bool eow_flag = (dawg->end_of_word(edge_ref) != 0);
323 if (eow_flag ==
true) {
326 edge_cnt += Edges(trail_punc_.c_str(), dawg, edge_ref,
329 edge_cnt += Edges(
"-/", dawg, 0, 0, edge_array + edge_cnt);
334 next_node = dawg->next_node(edge_ref);
335 if (next_node == 0 || next_node == NO_EDGE) {
345 edge_array + edge_cnt);
346 int strt_cnt = edge_cnt;
349 for (
int child = 0; child < child_edge_cnt; child++) {
350 reinterpret_cast<TessLangModEdge *
>(edge_array[edge_cnt++])->
351 SetEdgeMask(edge_mask);
355 if (root_flag ==
true) {
356 for (
int child = 0; child < child_edge_cnt; child++) {
357 TessLangModEdge *child_edge =
358 reinterpret_cast<TessLangModEdge *
>(edge_array[strt_cnt + child]);
360 if (has_case_ ==
true) {
361 const char_32 *edge_str = child_edge->EdgeString();
362 if (edge_str !=
NULL && islower(edge_str[0]) != 0 &&
366 if (class_id != INVALID_UNICHAR_ID) {
368 edge_array[edge_cnt] =
new TessLangModEdge(cntxt_, dawg,
369 child_edge->StartEdge(), child_edge->EndEdge(), class_id);
371 if (edge_array[edge_cnt] !=
NULL) {
372 reinterpret_cast<TessLangModEdge *
>(edge_array[edge_cnt])->
373 SetEdgeMask(edge_mask);
386 int TessLangModel::NumberEdges(
EDGE_REF edge_ref, LangModEdge **edge_array) {
407 new_state = num_state_machine_[state][lit];
412 if (new_state == state) {
413 new_repeat_cnt = repeat_cnt + 1;
419 if (new_repeat_cnt > num_max_repeat_[state]) {
427 edge_cnt += Edges(literal_str_[lit]->c_str(), number_dawg_,
428 new_edge_ref, 0, edge_array + edge_cnt);
435 bool TessLangModel::LoadLangModelElements(
const string &lm_params) {
438 vector<string> str_vec;
440 for (
int entry = 0; entry < str_vec.size(); entry++) {
441 vector<string> tokens;
444 if (tokens.size() != 2)
446 if (tokens[0] ==
"LeadPunc") {
447 lead_punc_ = tokens[1];
448 }
else if (tokens[0] ==
"TrailPunc") {
449 trail_punc_ = tokens[1];
450 }
else if (tokens[0] ==
"NumLeadPunc") {
451 num_lead_punc_ = tokens[1];
452 }
else if (tokens[0] ==
"NumTrailPunc") {
453 num_trail_punc_ = tokens[1];
454 }
else if (tokens[0] ==
"Operators") {
455 operators_ = tokens[1];
456 }
else if (tokens[0] ==
"Digits") {
458 }
else if (tokens[0] ==
"Alphas") {
473 literal_str_[0] = &num_lead_punc_;
474 literal_str_[1] = &num_trail_punc_;
475 literal_str_[2] = &digits_;
476 literal_str_[3] = &operators_;
477 literal_str_[4] = &alphas_;
492 for (
int i = 0; i < len; ++i) {
494 if (class_id != INVALID_UNICHAR_ID) {
495 clean_str32[clean_len] = lm_str32[i];
499 clean_str32[clean_len] = 0;
500 if (clean_len < len) {
504 delete [] clean_str32;
507 int TessLangModel::NumDawgs()
const {
508 return (word_dawgs_ !=
NULL) ?
514 const Dawg *TessLangModel::GetDawg(
int index)
const {
515 if (word_dawgs_ !=
NULL) {
517 return (*word_dawgs_)[index];
519 ASSERT_HOST(index < cntxt_->TesseractObject()->getDict().NumDawgs());
FILE * GetDataFilePtr() const
TessLangModel(const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
void RemoveInvalidCharacters(string *lm_str)
#define NUMBER_LITERAL_SHIFT
static int Prob2Cost(double prob_val)
#define IsTrailingPuncEdge(edge_mask)
tesseract::Tesseract * TesseractObject() const
basic_string< char_32 > string_32
#define NUMBER_REPEAT_SHIFT
GenericVector< Dawg * > DawgVector
const char_32 * EdgeString() const
virtual void SetRoot(bool flag)=0
#define TrailingPuncCount(edge_mask)
#define NUMBER_REPEAT_MASK
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
int ClassID(const char_32 *str) const
EDGE_REF EdgeMask() const
#define NUMBER_STATE_SHIFT
#define NUMBER_STATE_MASK
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
static int StrLen(const char_32 *str)
bool IsLeadingPunc(char_32 ch)
const Dawg * GetDawg() const
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
bool SeekToStart(TessdataType tessdata_type)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
CharSet * CharacterSet() const
bool IsTrailingPunc(char_32 ch)
#define LEAD_PUNC_EDGE_REF_MASK
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
#define TrailingPuncEdgeMask(Cnt)
const string & Lang() const