tesseract  4.0.0-1-g2a2b
ligature_table.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ligature_table.cpp
3  * Description: Class for adding and removing optional latin ligatures,
4  * conditional on codepoint support by a specified font
5  * (if specified).
6  * Author: Ranjith Unnikrishnan
7  * Created: Mon Nov 18 2013
8  *
9  * (C) Copyright 2013, Google Inc.
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  * http://www.apache.org/licenses/LICENSE-2.0
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  **********************************************************************/
21 
22 #include "ligature_table.h"
23 
24 #include <utility>
25 
26 #include "pango_font_info.h"
27 #include "tlog.h"
28 #include "unichar.h"
29 #include "unicharset.h"
30 #include "unicode/errorcode.h" // from libicu
31 #include "unicode/normlzr.h" // from libicu
32 #include "unicode/unistr.h" // from libicu
33 #include "unicode/utypes.h" // from libicu
34 
35 namespace tesseract {
36 
37 static std::string EncodeAsUTF8(const char32 ch32) {
38  UNICHAR uni_ch(ch32);
39  return std::string(uni_ch.utf8(), uni_ch.utf8_len());
40 }
41 
42 // Range of optional latin ligature characters in Unicode to build ligatures
43 // from. Note that this range does not contain the custom ligatures that we
44 // encode in the private use area.
45 const int kMinLigature = 0xfb00;
46 const int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in.
47 
48 /* static */
49 std::unique_ptr<LigatureTable> LigatureTable::instance_;
50 
51 /* static */
53  if (instance_ == nullptr) {
54  instance_.reset(new LigatureTable());
55  instance_->Init();
56  }
57  return instance_.get();
58 }
59 
60 LigatureTable::LigatureTable() : min_lig_length_(0), max_lig_length_(0),
61  min_norm_length_(0), max_norm_length_(0) {}
62 
64  if (norm_to_lig_table_.empty()) {
65  for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
66  // For each char in the range, convert to utf8, nfkc normalize, and if
67  // the strings are different put the both mappings in the hash_maps.
68  std::string lig8 = EncodeAsUTF8(lig);
69  icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
70  icu::UnicodeString normed8_result;
71  icu::ErrorCode status;
72  icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result,
73  status);
74  std::string normed8;
75  normed8_result.toUTF8String(normed8);
76  // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
77  // here manually so that AddLigatures() will work as desired.
78  if (lig8 == "\uFB05")
79  normed8 = "ſt";
80  int lig_length = lig8.length();
81  int norm_length = normed8.size();
82  if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
83  norm_to_lig_table_[normed8] = lig8;
84  lig_to_norm_table_[lig8] = normed8;
85  if (min_lig_length_ == 0 || lig_length < min_lig_length_)
86  min_lig_length_ = lig_length;
87  if (lig_length > max_lig_length_)
88  max_lig_length_ = lig_length;
89  if (min_norm_length_ == 0 || norm_length < min_norm_length_)
90  min_norm_length_ = norm_length;
91  if (norm_length > max_norm_length_)
92  max_norm_length_ = norm_length;
93  }
94  }
95  // Add custom extra ligatures.
96  for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
99  int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
100  if (min_norm_length_ == 0 || norm_length < min_norm_length_)
101  min_norm_length_ = norm_length;
102  if (norm_length > max_norm_length_)
103  max_norm_length_ = norm_length;
104 
107  }
108  }
109 }
110 
111 std::string LigatureTable::RemoveLigatures(const std::string& str) const {
112  std::string result;
113  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
114  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
115  char tmp[5];
116  int len;
117  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
118  len = it.get_utf8(tmp);
119  tmp[len] = '\0';
120  LigHash::const_iterator lig_it = lig_to_norm_table_.find(tmp);
121  if (lig_it != lig_to_norm_table_.end()) {
122  result += lig_it->second;
123  } else {
124  result += tmp;
125  }
126  }
127  return result;
128 }
129 
130 std::string LigatureTable::RemoveCustomLigatures(const std::string& str) const {
131  std::string result;
132  UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
133  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
134  char tmp[5];
135  int len;
136  int norm_ind;
137  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
138  len = it.get_utf8(tmp);
139  tmp[len] = '\0';
140  norm_ind = -1;
141  for (int i = 0;
142  UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {
143  if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
144  norm_ind = i;
145  }
146  }
147  if (norm_ind >= 0) {
148  result += UNICHARSET::kCustomLigatures[norm_ind][0];
149  } else {
150  result += tmp;
151  }
152  }
153  return result;
154 }
155 
156 std::string LigatureTable::AddLigatures(const std::string& str,
157  const PangoFontInfo* font) const {
158  std::string result;
159  int len = str.size();
160  int step = 0;
161  int i = 0;
162  for (i = 0; i < len - min_norm_length_ + 1; i += step) {
163  step = 0;
164  for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
165  if (i + liglen <= len) {
166  std::string lig_cand = str.substr(i, liglen);
167  LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
168  if (it != norm_to_lig_table_.end()) {
169  tlog(3, "Considering %s -> %s\n", lig_cand.c_str(),
170  it->second.c_str());
171  if (font) {
172  // Test for renderability.
173  if (!font->CanRenderString(it->second.data(), it->second.length()))
174  continue; // Not renderable
175  }
176  // Found a match so convert it.
177  step = liglen;
178  result += it->second;
179  tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(),
180  it->second.c_str());
181  break;
182  }
183  }
184  }
185  if (step == 0) {
186  result += str[i];
187  step = 1;
188  }
189  }
190  result += str.substr(i, len - i);
191  return result;
192 }
193 
194 } // namespace tesseract
const int kMaxLigature
signed int char32
signed int char32
Definition: unichar.h:52
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:151
int get_utf8(char *buf) const
Definition: unichar.cpp:176
std::string RemoveLigatures(const std::string &str) const
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
static std::unique_ptr< LigatureTable > instance_
#define tlog(level,...)
Definition: tlog.h:33
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
static LigatureTable * Get()
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206
const int kMinLigature
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
std::string RemoveCustomLigatures(const std::string &str) const