tesseract  5.0.0-alpha-619-ge9db
normstrngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: normstrngs.cpp
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  * UTF-8 strings.
5  * Author: Ranjith Unnikrishnan
6  * Created: Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "normstrngs.h"
22 
23 #include <string>
24 #include <unordered_map>
25 #include <vector>
26 
27 #include "errcode.h"
28 #include "icuerrorcode.h"
29 #include <tesseract/unichar.h>
30 #include "unicode/normalizer2.h" // From libicu
31 #include "unicode/translit.h" // From libicu
32 #include "unicode/uchar.h" // From libicu
33 #include "unicode/unorm2.h" // From libicu
34 #include "unicode/uscript.h" // From libicu
35 
36 namespace tesseract {
37 
38 static bool is_hyphen_punc(const char32 ch) {
39  static const int kNumHyphenPuncUnicodes = 13;
40  static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
41  '-', 0x2010, 0x2011, 0x2012,
42  0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
43  0x207b, // superscript minus
44  0x208b, // subscript minus
45  0x2212, // minus sign
46  0xfe58, // small em dash
47  0xfe63, // small hyphen-minus
48  0xff0d, // fullwidth hyphen-minus
49  };
50  for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
51  if (kHyphenPuncUnicodes[i] == ch) return true;
52  }
53  return false;
54 }
55 
56 static bool is_single_quote(const char32 ch) {
57  static const int kNumSingleQuoteUnicodes = 8;
58  static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
59  '\'', '`',
60  0x2018, // left single quotation mark (English, others)
61  0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
62  // We may have to introduce a comma set with 0x201a
63  0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
64  0x2032, // prime
65  0x300C, // left corner bracket (East Asian languages)
66  0xFF07, // fullwidth apostrophe
67  };
68  for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
69  if (kSingleQuoteUnicodes[i] == ch) return true;
70  }
71  return false;
72 }
73 
74 static bool is_double_quote(const char32 ch) {
75  static const int kNumDoubleQuoteUnicodes = 8;
76  static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
77  '"',
78  0x201C, // left double quotation mark (English, others)
79  0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
80  0x201F, // double high-reversed-9 quotation mark (PropList.txt)
81  0x2033, // double prime
82  0x301D, // reversed double prime quotation mark (East Asian langs,
83  // horiz.)
84  0x301E, // close double prime (East Asian languages written horizontally)
85  0xFF02, // fullwidth quotation mark
86  };
87  for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
88  if (kDoubleQuoteUnicodes[i] == ch) return true;
89  }
90  return false;
91 }
92 
93 // Helper runs a standard unicode normalization, optional OCR normalization,
94 // and leaves the result as char32 for subsequent processing.
95 static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
96  const char* str8,
97  std::vector<char32>* normed32) {
98  // Convert to ICU string for unicode normalization.
99  icu::UnicodeString uch_str(str8, "UTF-8");
100  IcuErrorCode error_code;
101  // Convert the enum to the new weird icu representation.
102  const char* norm_type =
103  u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
104  ? "nfkc"
105  : "nfc";
106  UNormalization2Mode compose =
107  u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
108  ? UNORM2_COMPOSE
109  : UNORM2_DECOMPOSE;
110  // Pointer to singleton does not require deletion.
111  const icu::Normalizer2* normalizer =
112  icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
113  error_code.assertSuccess();
114  error_code.reset();
115  icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
116  error_code.assertSuccess();
117  // Convert to char32 for output. OCR normalization if required.
118  normed32->reserve(norm_str.length()); // An approximation.
119  for (int offset = 0; offset < norm_str.length();
120  offset = norm_str.moveIndex32(offset, 1)) {
121  char32 ch = norm_str.char32At(offset);
122  // Skip all ZWS, RTL and LTR marks.
123  if (Validator::IsZeroWidthMark(ch)) continue;
124  if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
125  normed32->push_back(ch);
126  }
127 }
128 
129 // Helper removes joiners from strings that contain no letters.
130 static void StripJoiners(std::vector<char32>* str32) {
131  for (char32 ch : *str32) {
132  if (u_isalpha(ch)) return;
133  }
134  int len = 0;
135  for (char32 ch : *str32) {
136  if (ch != Validator::kZeroWidthJoiner &&
138  (*str32)[len++] = ch;
139  }
140  }
141  str32->resize(len);
142 }
143 
144 // Normalizes a UTF8 string according to the given modes. Returns true on
145 // success. If false is returned, some failure or invalidity was present, and
146 // the result string is produced on a "best effort" basis.
147 bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
148  GraphemeNorm grapheme_normalize, const char* str8,
149  std::string* normalized) {
150  std::vector<char32> normed32;
151  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
152  if (grapheme_normalize == GraphemeNorm::kNormalize) {
153  StripJoiners(&normed32);
154  std::vector<std::vector<char32>> graphemes;
155  bool success = Validator::ValidateCleanAndSegment(
156  GraphemeNormMode::kSingleString, false, normed32, &graphemes);
157  if (graphemes.empty() || graphemes[0].empty()) {
158  success = false;
159  } else if (normalized != nullptr) {
160  *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
161  }
162  return success;
163  }
164  if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
165  return true;
166 }
167 
168 // Normalizes a UTF8 string according to the given modes and splits into
169 // graphemes according to g_mode. Returns true on success. If false is returned,
170 // some failure or invalidity was present, and the result string is produced on
171 // a "best effort" basis.
172 bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
173  GraphemeNormMode g_mode, bool report_errors,
174  const char* str8,
175  std::vector<std::string>* graphemes) {
176  std::vector<char32> normed32;
177  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
178  StripJoiners(&normed32);
179  std::vector<std::vector<char32>> graphemes32;
180  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
181  normed32, &graphemes32);
182  if (g_mode != GraphemeNormMode::kSingleString && success) {
183  // If we modified the string to clean it up, the segmentation may not be
184  // correct, so check for changes and do it again.
185  std::vector<char32> cleaned32;
186  for (const auto& g : graphemes32) {
187  cleaned32.insert(cleaned32.end(), g.begin(), g.end());
188  }
189  if (cleaned32 != normed32) {
190  graphemes32.clear();
191  success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
192  cleaned32, &graphemes32);
193  }
194  }
195  graphemes->clear();
196  graphemes->reserve(graphemes32.size());
197  for (const auto& grapheme : graphemes32) {
198  graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
199  }
200  return success;
201 }
202 
203 // Apply just the OCR-specific normalizations and return the normalized char.
205  if (is_hyphen_punc(ch))
206  return '-';
207  else if (is_single_quote(ch))
208  return '\'';
209  else if (is_double_quote(ch))
210  return '"';
211  return ch;
212 }
213 
214 bool IsOCREquivalent(char32 ch1, char32 ch2) {
215  return OCRNormalize(ch1) == OCRNormalize(ch2);
216 }
217 
218 bool IsValidCodepoint(const char32 ch) {
219  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
220  return (static_cast<uint32_t>(ch) < 0xD800) || (ch >= 0xE000 && ch <= 0x10FFFF);
221 }
222 
223 bool IsWhitespace(const char32 ch) {
224  ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n",
225  ch);
226  return u_isUWhiteSpace(static_cast<UChar32>(ch));
227 }
228 
229 bool IsUTF8Whitespace(const char* text) {
230  return SpanUTF8Whitespace(text) == strlen(text);
231 }
232 
233 unsigned int SpanUTF8Whitespace(const char* text) {
234  int n_white = 0;
235  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
236  it != UNICHAR::end(text, strlen(text)); ++it) {
237  if (!IsWhitespace(*it)) break;
238  n_white += it.utf8_len();
239  }
240  return n_white;
241 }
242 
243 unsigned int SpanUTF8NotWhitespace(const char* text) {
244  int n_notwhite = 0;
245  for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
246  it != UNICHAR::end(text, strlen(text)); ++it) {
247  if (IsWhitespace(*it)) break;
248  n_notwhite += it.utf8_len();
249  }
250  return n_notwhite;
251 }
252 
253 bool IsInterchangeValid(const char32 ch) {
254  return IsValidCodepoint(ch) &&
255  !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
256  !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
257  !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
258  !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
259  !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
260  !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
261  !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
262  !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
263  !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
264  !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
265  !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
266  !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
267  !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
268  !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
269  !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
270  !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
271  !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
272  (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
273  ch == '\f' || ch == '\t' || ch == '\r');
274 }
275 
276 bool IsInterchangeValid7BitAscii(const char32 ch) {
277  return IsValidCodepoint(ch) && ch <= 128 &&
278  (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
279  ch == '\f' || ch == '\t' || ch == '\r');
280 }
281 
283  // Return unchanged if not in the fullwidth-halfwidth Unicode block.
284  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
285  if (ch != 0x3000) return ch;
286  }
287  // Special case for fullwidth left and right "white parentheses".
288  if (ch == 0xFF5F) return 0x2985;
289  if (ch == 0xFF60) return 0x2986;
290  // Construct a full-to-half width transliterator.
291  IcuErrorCode error_code;
292  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
293  const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
294  "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
295  error_code.assertSuccess();
296  error_code.reset();
297 
298  fulltohalf->transliterate(uch_str);
299  delete fulltohalf;
300  ASSERT_HOST(uch_str.length() != 0);
301  return uch_str[0];
302 }
303 
304 } // namespace tesseract
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::UnicodeNormMode
UnicodeNormMode
Definition: normstrngs.h:48
tesseract::NormalizeUTF8String
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:163
tesseract::OCRNorm
OCRNorm
Definition: normstrngs.h:57
tesseract::IsWhitespace
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:239
tesseract::UNICHAR::begin
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
tesseract::Validator::IsZeroWidthMark
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:102
tesseract::SpanUTF8Whitespace
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:249
tesseract::UNICHAR::end
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::IsInterchangeValid
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:269
icuerrorcode.h
tesseract::UNICHAR::const_iterator
Definition: unichar.h:109
tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:188
tesseract::IsValidCodepoint
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:234
tesseract::IsInterchangeValid7BitAscii
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:292
tesseract::IsOCREquivalent
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:230
tesseract::OCRNormalize
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:220
ASSERT_HOST_MSG
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:91
tesseract::GraphemeNormMode
GraphemeNormMode
Definition: validator.h:48
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::Validator::ValidateCleanAndSegment
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:40
tesseract::GraphemeNormMode::kSingleString
tesseract::FullwidthToHalfwidth
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:298
tesseract::UNICHAR::UTF32ToUTF8
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::UnicodeNormMode::kNFKC
tesseract
Definition: baseapi.h:65
tesseract::SpanUTF8NotWhitespace
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:259
normstrngs.h
tesseract::IsUTF8Whitespace
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:245
unichar.h
tesseract::UnicodeNormMode::kNFC
char32
signed int char32
Definition: pango_font_info.h:33
errcode.h
tesseract::GraphemeNorm
GraphemeNorm
Definition: normstrngs.h:65
tesseract::OCRNorm::kNormalize
tesseract::UnicodeNormMode::kNFKD