tesseract  4.0.0-1-g2a2b
tesseract::LigatureTable Class Reference

#include <ligature_table.h>

Public Member Functions

std::string AddLigatures (const std::string &str, const PangoFontInfo *font) const
 
std::string RemoveLigatures (const std::string &str) const
 
std::string RemoveCustomLigatures (const std::string &str) const
 
const LigHash & norm_to_lig_table () const
 
const LigHash & lig_to_norm_table () const
 

Static Public Member Functions

static LigatureTable * Get ()
 

Protected Member Functions

 LigatureTable ()
 
void Init ()
 

Protected Attributes

LigHash norm_to_lig_table_
 
LigHash lig_to_norm_table_
 
int min_lig_length_
 
int max_lig_length_
 
int min_norm_length_
 
int max_norm_length_
 

Static Protected Attributes

static std::unique_ptr< LigatureTable > instance_
 

Detailed Description

Definition at line 38 of file ligature_table.h.

Constructor & Destructor Documentation

◆ LigatureTable()

tesseract::LigatureTable::LigatureTable ( )
protected

Member Function Documentation

◆ AddLigatures()

std::string tesseract::LigatureTable::AddLigatures ( const std::string &  str,
const PangoFontInfo * font 
) const

Definition at line 156 of file ligature_table.cpp.

157  {
158  std::string result;
159  int len = str.size();
160  int step = 0;
161  int i = 0;
162  for (i = 0; i < len - min_norm_length_ + 1; i += step) {
163  step = 0;
164  for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
165  if (i + liglen <= len) {
166  std::string lig_cand = str.substr(i, liglen);
167  LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
168  if (it != norm_to_lig_table_.end()) {
169  tlog(3, "Considering %s -> %s\n", lig_cand.c_str(),
170  it->second.c_str());
171  if (font) {
172  // Test for renderability.
173  if (!font->CanRenderString(it->second.data(), it->second.length()))
174  continue; // Not renderable
175  }
176  // Found a match so convert it.
177  step = liglen;
178  result += it->second;
179  tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(),
180  it->second.c_str());
181  break;
182  }
183  }
184  }
185  if (step == 0) {
186  result += str[i];
187  step = 1;
188  }
189  }
190  result += str.substr(i, len - i);
191  return result;
192 }
#define tlog(level,...)
Definition: tlog.h:33

◆ Get()

LigatureTable * tesseract::LigatureTable::Get ( )
static

Definition at line 52 of file ligature_table.cpp.

52  {
53  if (instance_ == nullptr) {
54  instance_.reset(new LigatureTable());
55  instance_->Init();
56  }
57  return instance_.get();
58 }
static std::unique_ptr< LigatureTable > instance_

◆ Init()

void tesseract::LigatureTable::Init ( )
protected

Definition at line 63 of file ligature_table.cpp.

63  {
64  if (norm_to_lig_table_.empty()) {
65  for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
66  // For each char in the range, convert to utf8, nfkc normalize, and if
67  // the strings are different put the both mappings in the hash_maps.
68  std::string lig8 = EncodeAsUTF8(lig);
69  icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
70  icu::UnicodeString normed8_result;
71  icu::ErrorCode status;
72  icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result,
73  status);
74  std::string normed8;
75  normed8_result.toUTF8String(normed8);
76  // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
77  // here manually so that AddLigatures() will work as desired.
78  if (lig8 == "\uFB05")
79  normed8 = "ſt";
80  int lig_length = lig8.length();
81  int norm_length = normed8.size();
82  if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
83  norm_to_lig_table_[normed8] = lig8;
84  lig_to_norm_table_[lig8] = normed8;
85  if (min_lig_length_ == 0 || lig_length < min_lig_length_)
86  min_lig_length_ = lig_length;
87  if (lig_length > max_lig_length_)
88  max_lig_length_ = lig_length;
89  if (min_norm_length_ == 0 || norm_length < min_norm_length_)
90  min_norm_length_ = norm_length;
91  if (norm_length > max_norm_length_)
92  max_norm_length_ = norm_length;
93  }
94  }
95  // Add custom extra ligatures.
96  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
97  norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] =
98  UNICHARSET::kCustomLigatures[i][1];
99  int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
100  if (min_norm_length_ == 0 || norm_length < min_norm_length_)
101  min_norm_length_ = norm_length;
102  if (norm_length > max_norm_length_)
103  max_norm_length_ = norm_length;
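104 
105  lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] =
106  UNICHARSET::kCustomLigatures[i][0];
107  }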
108  }
109 }
const int kMaxLigature
signed int char32
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:151
const int kMinLigature

◆ lig_to_norm_table()

const LigHash& tesseract::LigatureTable::lig_to_norm_table ( ) const
inline

Definition at line 55 of file ligature_table.h.

55  {
56  return lig_to_norm_table_;
57  }

◆ norm_to_lig_table()

const LigHash& tesseract::LigatureTable::norm_to_lig_table ( ) const
inline

Definition at line 52 of file ligature_table.h.

52  {
53  return norm_to_lig_table_;
54  }

◆ RemoveCustomLigatures()

std::string tesseract::LigatureTable::RemoveCustomLigatures ( const std::string &  str) const

Definition at line 130 of file ligature_table.cpp.

130  {
131  std::string result;
132  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
133  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
134  char tmp[5];
135  int len;
136  int norm_ind;
137  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
138  len = it.get_utf8(tmp);
139  tmp[len] = '\0';
140  norm_ind = -1;
141  for (int i = 0;
142  UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {
143  if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
144  norm_ind = i;
145  }
146  }
147  if (norm_ind >= 0) {
148  result += UNICHARSET::kCustomLigatures[norm_ind][0];
149  } else {
150  result += tmp;
151  }
152  }
153  return result;
154 }
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:151
int get_utf8(char *buf) const
Definition: unichar.cpp:176
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206

◆ RemoveLigatures()

std::string tesseract::LigatureTable::RemoveLigatures ( const std::string &  str) const

Definition at line 111 of file ligature_table.cpp.

111  {
112  std::string result;
113  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
114  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
115  char tmp[5];
116  int len;
117  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
118  len = it.get_utf8(tmp);
119  tmp[len] = '\0';
120  LigHash::const_iterator lig_it = lig_to_norm_table_.find(tmp);
121  if (lig_it != lig_to_norm_table_.end()) {
122  result += lig_it->second;
123  } else {
124  result += tmp;
125  }
126  }
127  return result;
128 }
int get_utf8(char *buf) const
Definition: unichar.cpp:176
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206

Member Data Documentation

◆ instance_

std::unique_ptr< LigatureTable > tesseract::LigatureTable::instance_
static protected

Definition at line 65 of file ligature_table.h.

◆ lig_to_norm_table_

LigHash tesseract::LigatureTable::lig_to_norm_table_
protected

Definition at line 67 of file ligature_table.h.

◆ max_lig_length_

int tesseract::LigatureTable::max_lig_length_
protected

Definition at line 69 of file ligature_table.h.

◆ max_norm_length_

int tesseract::LigatureTable::max_norm_length_
protected

Definition at line 71 of file ligature_table.h.

◆ min_lig_length_

int tesseract::LigatureTable::min_lig_length_
protected

Definition at line 68 of file ligature_table.h.

◆ min_norm_length_

int tesseract::LigatureTable::min_norm_length_
protected

Definition at line 70 of file ligature_table.h.

◆ norm_to_lig_table_

LigHash tesseract::LigatureTable::norm_to_lig_table_
protected

Definition at line 66 of file ligature_table.h.


The documentation for this class was generated from the following files: