tesseract  5.0.0-alpha-619-ge9db
unicharset.cpp
Go to the documentation of this file.
1 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "unicharset.h"
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <cstdio>
24 #include <cstring>
25 #include <iomanip> // for std::setw
26 #include <locale> // for std::locale::classic
27 #include <sstream> // for std::istringstream, std::ostringstream
28 
29 #include "params.h"
30 #include <tesseract/serialis.h>
31 #include <tesseract/unichar.h>
32 
33 // TODO(rays) Move UNICHARSET to tesseract namespace.
34 using tesseract::char32;
35 using tesseract::UNICHAR;
36 
// Separator character used when writing out character fragments.
static constexpr char kSeparator = '|';
// Flag character that marks a 'natural' character fragment.
static constexpr char kNaturalFlag = 'n';

// Bit masks for the character-class flags that get_properties() combines
// into a single word (one bit per class).
static constexpr int ISALPHA_MASK = 1 << 0;
static constexpr int ISLOWER_MASK = 1 << 1;
static constexpr int ISUPPER_MASK = 1 << 2;
static constexpr int ISDIGIT_MASK = 1 << 3;
static constexpr int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate threshold separating cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// Let C be the number of alpha chars whose tops all exceed
// kMeanlineThreshold, and X the number of alpha chars whose tops are all
// below kMeanlineThreshold. The unicharset "has x-height" if
// X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction, or if more
// than half of the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
60 
61 /*static */
62 const char* UNICHARSET::kCustomLigatures[][2] = {
63  {"ct", "\uE003"}, // c + t -> U+E003
64  {"ſh", "\uE006"}, // long-s + h -> U+E006
65  {"ſi", "\uE007"}, // long-s + i -> U+E007
66  {"ſl", "\uE008"}, // long-s + l -> U+E008
67  {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
68  {nullptr, nullptr}
69 };
70 
71 // List of mappings to make when ingesting strings from the outside.
72 // The substitutions clean up text that should exist for rendering of
73 // synthetic data, but not in the recognition set.
74 const char* UNICHARSET::kCleanupMaps[][2] = {
75  {"\u0640", ""}, // TATWEEL is deleted.
76  {"\ufb01", "fi"}, // fi ligature->fi pair.
77  {"\ufb02", "fl"}, // fl ligature->fl pair.
78  {nullptr, nullptr}};
79 
80 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
82  " ",
83  "Joined",
84  "|Broken|0|1"
85 };
86 
87 const char* UNICHARSET::null_script = "NULL";
88 
89 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
90  Init();
91 }
92 
93 // Initialize all properties to sensible default values.
94 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
95  isalpha = false;
96  islower = false;
97  isupper = false;
98  isdigit = false;
99  ispunctuation = false;
100  isngram = false;
101  enabled = false;
102  SetRangesOpen();
103  script_id = 0;
104  other_case = 0;
105  mirror = 0;
106  normed = "";
107  direction = UNICHARSET::U_LEFT_TO_RIGHT;
108  fragment = nullptr;
109 }
110 
111 // Sets all ranges wide open. Initialization default in case there are
112 // no useful values available.
113 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
114  min_bottom = 0;
115  max_bottom = UINT8_MAX;
116  min_top = 0;
117  max_top = UINT8_MAX;
118  width = 0.0f;
119  width_sd = 0.0f;
120  bearing = 0.0f;
121  bearing_sd = 0.0f;
122  advance = 0.0f;
123  advance_sd = 0.0f;
124 }
125 
126 // Sets all ranges to empty. Used before expanding with font-based data.
127 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
128  min_bottom = UINT8_MAX;
129  max_bottom = 0;
130  min_top = UINT8_MAX;
131  max_top = 0;
132  width = 0.0f;
133  width_sd = 0.0f;
134  bearing = 0.0f;
135  bearing_sd = 0.0f;
136  advance = 0.0f;
137  advance_sd = 0.0f;
138 }
139 
140 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
141 // is empty.
142 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
143  return width == 0.0f || advance == 0.0f;
144 }
145 
146 // Expands the ranges with the ranges from the src properties.
147 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
148  const UNICHAR_PROPERTIES& src) {
149  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
150  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
151  UpdateRange(src.min_top, &min_top, &max_top);
152  UpdateRange(src.max_top, &min_top, &max_top);
153  if (src.width_sd > width_sd) {
154  width = src.width;
155  width_sd = src.width_sd;
156  }
157  if (src.bearing_sd > bearing_sd) {
158  bearing = src.bearing;
159  bearing_sd = src.bearing_sd;
160  }
161  if (src.advance_sd > advance_sd) {
162  advance = src.advance;
163  advance_sd = src.advance_sd;
164  }
165 }
166 
167 // Copies the properties from src into this.
168 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
169  // Apart from the fragment, everything else can be done with a default copy.
170  CHAR_FRAGMENT* saved_fragment = fragment;
171  *this = src; // Bitwise copy.
172  fragment = saved_fragment;
173 }
174 
176  unichars(nullptr),
177  ids(),
178  size_used(0),
179  size_reserved(0),
180  script_table(nullptr),
181  script_table_size_used(0) {
182  clear();
183  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
185  if (i == UNICHAR_JOINED)
186  set_isngram(i, true);
187  }
188 }
189 
191  clear();
192 }
193 
194 void UNICHARSET::reserve(int unichars_number) {
195  if (unichars_number > size_reserved) {
196  auto* unichars_new = new UNICHAR_SLOT[unichars_number];
197  for (int i = 0; i < size_used; ++i)
198  unichars_new[i] = unichars[i];
199  for (int j = size_used; j < unichars_number; ++j) {
200  unichars_new[j].properties.script_id = add_script(null_script);
201  }
202  delete[] unichars;
203  unichars = unichars_new;
204  size_reserved = unichars_number;
205  }
206 }
207 
209 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
210  std::string cleaned =
211  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
212  return ids.contains(cleaned.data(), cleaned.size())
213  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
214  : INVALID_UNICHAR_ID;
215 }
216 
217 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
218  int length) const {
219  assert(length > 0 && length <= UNICHAR_LEN);
220  std::string cleaned(unichar_repr, length);
221  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
222  return ids.contains(cleaned.data(), cleaned.size())
223  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
224  : INVALID_UNICHAR_ID;
225 }
226 
227 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
228 // while leaving the rest of the string encodable. Returns 0 if the
229 // beginning of the string is not encodable.
230 // WARNING: this function now encodes the whole string for precision.
231 // Use encode_string in preference to repeatedly calling step.
232 int UNICHARSET::step(const char* str) const {
233  GenericVector<UNICHAR_ID> encoding;
234  GenericVector<char> lengths;
235  encode_string(str, true, &encoding, &lengths, nullptr);
236  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
237  return lengths[0];
238 }
239 
240 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
241 // If not encodable, write the first byte offset which cannot be converted
242 // into the second (return) argument.
243 bool UNICHARSET::encodable_string(const char *str,
244  int *first_bad_position) const {
245  GenericVector<UNICHAR_ID> encoding;
246  return encode_string(str, true, &encoding, nullptr, first_bad_position);
247 }
248 
249 // Encodes the given UTF-8 string with this UNICHARSET.
250 // Returns true if the encoding succeeds completely, false if there is at
251 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
252 // the rest of the string is still encoded.
253 // If lengths is not nullptr, then it is filled with the corresponding
254 // byte length of each encoded UNICHAR_ID.
255 // WARNING: Caller must guarantee that str has already been cleaned of codes
256 // that do not belong in the unicharset, or encoding may fail.
257 // Use CleanupString to perform the cleaning.
258 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
259  GenericVector<UNICHAR_ID>* encoding,
260  GenericVector<char>* lengths,
261  int* encoded_length) const {
262  GenericVector<UNICHAR_ID> working_encoding;
263  GenericVector<char> working_lengths;
264  GenericVector<char> best_lengths;
265  encoding->truncate(0); // Just in case str is empty.
266  int str_length = strlen(str);
267  int str_pos = 0;
268  bool perfect = true;
269  while (str_pos < str_length) {
270  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
271  &str_pos, encoding, &best_lengths);
272  if (str_pos < str_length) {
273  // This is a non-match. Skip one utf-8 character.
274  perfect = false;
275  if (give_up_on_failure) break;
276  int step = UNICHAR::utf8_step(str + str_pos);
277  if (step == 0) step = 1;
278  encoding->push_back(INVALID_UNICHAR_ID);
279  best_lengths.push_back(step);
280  str_pos += step;
281  working_encoding = *encoding;
282  working_lengths = best_lengths;
283  }
284  }
285  if (lengths != nullptr) *lengths = best_lengths;
286  if (encoded_length != nullptr) *encoded_length = str_pos;
287  return perfect;
288 }
289 
290 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
291  if (id == INVALID_UNICHAR_ID) {
292  return INVALID_UNICHAR;
293  }
294  ASSERT_HOST(id < this->size());
295  return unichars[id].representation;
296 }
297 
299  if (id == INVALID_UNICHAR_ID) {
300  return INVALID_UNICHAR;
301  }
302  ASSERT_HOST(id < this->size());
303  // Resolve from the kCustomLigatures table if this is a private encoding.
304  if (get_isprivate(id)) {
305  const char* ch = id_to_unichar(id);
306  for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
307  if (!strcmp(ch, kCustomLigatures[i][1])) {
308  return kCustomLigatures[i][0];
309  }
310  }
311  }
312  // Otherwise return the stored representation.
313  return unichars[id].representation;
314 }
315 
316 // Return a STRING that reformats the utf8 str into the str followed
317 // by its hex unicodes.
319  STRING result = str;
320  result += " [";
321  int step = 1;
322  // Chop into unicodes and code each as hex.
323  for (int i = 0; str[i] != '\0'; i += step) {
324  char hex[sizeof(int) * 2 + 1];
325  step = UNICHAR::utf8_step(str + i);
326  if (step == 0) {
327  step = 1;
328  sprintf(hex, "%x", str[i]);
329  } else {
330  UNICHAR ch(str + i, step);
331  sprintf(hex, "%x", ch.first_uni());
332  }
333  result += hex;
334  result += " ";
335  }
336  result += "]";
337  return result;
338 }
339 
340 // Return a STRING containing debug information on the unichar, including
341 // the id_to_unichar, its hex unicodes and the properties.
343  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
344  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
345  if (fragment) {
346  return fragment->to_string();
347  }
348  const char* str = id_to_unichar(id);
349  STRING result = debug_utf8_str(str);
350  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
351  if (get_isalpha(id)) {
352  if (get_islower(id))
353  result += "a";
354  else if (get_isupper(id))
355  result += "A";
356  else
357  result += "x";
358  }
359  // Append 0 if a digit.
360  if (get_isdigit(id)) {
361  result += "0";
362  }
363  // Append p if a punctuation symbol.
364  if (get_ispunctuation(id)) {
365  result += "p";
366  }
367  return result;
368 }
369 
370 // Sets the normed_ids vector from the normed string. normed_ids is not
371 // stored in the file, and needs to be set when the UNICHARSET is loaded.
373  unichars[unichar_id].properties.normed_ids.truncate(0);
374  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
375  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
376  } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
377  true, &unichars[unichar_id].properties.normed_ids,
378  nullptr, nullptr)) {
379  unichars[unichar_id].properties.normed_ids.truncate(0);
380  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
381  }
382 }
383 
384 // Returns whether the unichar id represents a unicode value in the private use
385 // area. We use this range only internally to represent uncommon ligatures
386 // (eg. 'ct') that do not have regular unicode values.
387 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
388  UNICHAR uc(id_to_unichar(unichar_id), -1);
389  int uni = uc.first_uni();
390  return (uni >= 0xE000 && uni <= 0xF8FF);
391 }
392 
393 
394 // Sets all ranges to empty, so they can be expanded to set the values.
396  for (int id = 0; id < size_used; ++id) {
397  unichars[id].properties.SetRangesEmpty();
398  }
399 }
400 
401 // Sets all the properties for this unicharset given a src unicharset with
402 // everything set. The unicharsets don't have to be the same, and graphemes
403 // are correctly accounted for.
405  const UNICHARSET& src) {
406  for (int ch = start_index; ch < size_used; ++ch) {
407  const char* utf8 = id_to_unichar(ch);
408  UNICHAR_PROPERTIES properties;
409  if (src.GetStrProperties(utf8, &properties)) {
410  // Setup the script_id, other_case, and mirror properly.
411  const char* script = src.get_script_from_script_id(properties.script_id);
412  properties.script_id = add_script(script);
413  const char* other_case = src.id_to_unichar(properties.other_case);
414  if (contains_unichar(other_case)) {
415  properties.other_case = unichar_to_id(other_case);
416  } else {
417  properties.other_case = ch;
418  }
419  const char* mirror_str = src.id_to_unichar(properties.mirror);
420  if (contains_unichar(mirror_str)) {
421  properties.mirror = unichar_to_id(mirror_str);
422  } else {
423  properties.mirror = ch;
424  }
425  unichars[ch].properties.CopyFrom(properties);
426  set_normed_ids(ch);
427  }
428  }
429 }
430 
431 // Expands the tops and bottoms and widths for this unicharset given a
432 // src unicharset with ranges in it. The unicharsets don't have to be the
433 // same, and graphemes are correctly accounted for.
435  for (int ch = 0; ch < size_used; ++ch) {
436  const char* utf8 = id_to_unichar(ch);
437  UNICHAR_PROPERTIES properties;
438  if (src.GetStrProperties(utf8, &properties)) {
439  // Expand just the ranges from properties.
440  unichars[ch].properties.ExpandRangesFrom(properties);
441  }
442  }
443 }
444 
445 // Makes this a copy of src. Clears this completely first, so the automatic
446 // ids will not be present in this if not in src. Does NOT reorder the set!
448  clear();
449  for (int ch = 0; ch < src.size_used; ++ch) {
450  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
451  const char* utf8 = src.id_to_unichar(ch);
453  unichars[ch].properties.ExpandRangesFrom(src_props);
454  }
455  // Set properties, including mirror and other_case, WITHOUT reordering
456  // the unicharset.
458 }
459 
460 // For each id in src, if it does not occur in this, add it, as in
461 // SetPropertiesFromOther, otherwise expand the ranges, as in
462 // ExpandRangesFromOther.
464  int initial_used = size_used;
465  for (int ch = 0; ch < src.size_used; ++ch) {
466  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
467  const char* utf8 = src.id_to_unichar(ch);
468  int id = size_used;
469  if (contains_unichar(utf8)) {
470  id = unichar_to_id(utf8);
471  // Just expand current ranges.
472  unichars[id].properties.ExpandRangesFrom(src_props);
473  } else {
475  unichars[id].properties.SetRangesEmpty();
476  }
477  }
478  // Set properties, including mirror and other_case, WITHOUT reordering
479  // the unicharset.
480  PartialSetPropertiesFromOther(initial_used, src);
481 }
482 
483 // Returns true if the acceptable ranges of the tops of the characters do
484 // not overlap, making their x-height calculations distinct.
486  int overlap = std::min(unichars[id1].properties.max_top,
487  unichars[id2].properties.max_top) -
488  std::max(unichars[id1].properties.min_top,
489  unichars[id2].properties.min_top);
490  return overlap <= 0;
491 }
492 
493 // Internal recursive version of encode_string above.
494 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
495 // each UNICHAR_ID uses the least possible part of the utf8 str.
496 // It does this by depth-first tail recursion on increasing length matches
497 // to the UNICHARSET, saving the first encountered result that encodes the
498 // maximum total length of str. It stops on a failure to encode to make
499 // the overall process of encoding a partially failed string more efficient.
500 // See unicharset.h for definition of the args.
501 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
502  GenericVector<UNICHAR_ID>* encoding,
503  GenericVector<char>* lengths,
504  int* best_total_length,
505  GenericVector<UNICHAR_ID>* best_encoding,
506  GenericVector<char>* best_lengths) const {
507  if (str_index > *best_total_length) {
508  // This is the best result so far.
509  *best_total_length = str_index;
510  *best_encoding = *encoding;
511  if (best_lengths != nullptr)
512  *best_lengths = *lengths;
513  }
514  if (str_index == str_length) return;
515  int encoding_index = encoding->size();
516  // Find the length of the first matching unicharset member.
517  int length = ids.minmatch(str + str_index);
518  if (length == 0 || str_index + length > str_length) return;
519  do {
520  if (ids.contains(str + str_index, length)) {
521  // Successful encoding so far.
522  UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
523  encoding->push_back(id);
524  lengths->push_back(length);
525  encode_string(str, str_index + length, str_length, encoding, lengths,
526  best_total_length, best_encoding, best_lengths);
527  if (*best_total_length == str_length)
528  return; // Tail recursion success!
529  // Failed with that length, truncate back and try again.
530  encoding->truncate(encoding_index);
531  lengths->truncate(encoding_index);
532  }
533  int step = UNICHAR::utf8_step(str + str_index + length);
534  if (step == 0) step = 1;
535  length += step;
536  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
537 }
538 
539 // Gets the properties for a grapheme string, combining properties for
540 // multiple characters in a meaningful way where possible.
541 // Returns false if no valid match was found in the unicharset.
542 // NOTE that script_id, mirror, and other_case refer to this unicharset on
543 // return and will need translation if the target unicharset is different.
544 bool UNICHARSET::GetStrProperties(const char* utf8_str,
545  UNICHAR_PROPERTIES* props) const {
546  props->Init();
547  props->SetRangesEmpty();
548  int total_unicodes = 0;
549  GenericVector<UNICHAR_ID> encoding;
550  if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
551  return false; // Some part was invalid.
552  for (int i = 0; i < encoding.size(); ++i) {
553  int id = encoding[i];
554  const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
555  // Logical OR all the bools.
556  if (src_props.isalpha) props->isalpha = true;
557  if (src_props.islower) props->islower = true;
558  if (src_props.isupper) props->isupper = true;
559  if (src_props.isdigit) props->isdigit = true;
560  if (src_props.ispunctuation) props->ispunctuation = true;
561  if (src_props.isngram) props->isngram = true;
562  if (src_props.enabled) props->enabled = true;
563  // Min/max the tops/bottoms.
564  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
565  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
566  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
567  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
568  float bearing = props->advance + src_props.bearing;
569  if (total_unicodes == 0 || bearing < props->bearing) {
570  props->bearing = bearing;
571  props->bearing_sd = props->advance_sd + src_props.bearing_sd;
572  }
573  props->advance += src_props.advance;
574  props->advance_sd += src_props.advance_sd;
575  // With a single width, just use the widths stored in the unicharset.
576  props->width = src_props.width;
577  props->width_sd = src_props.width_sd;
578  // Use the first script id, other_case, mirror, direction.
579  // Note that these will need translation, except direction.
580  if (total_unicodes == 0) {
581  props->script_id = src_props.script_id;
582  props->other_case = src_props.other_case;
583  props->mirror = src_props.mirror;
584  props->direction = src_props.direction;
585  }
586  // The normed string for the compound character is the concatenation of
587  // the normed versions of the individual characters.
588  props->normed += src_props.normed;
589  ++total_unicodes;
590  }
591  if (total_unicodes > 1) {
592  // Estimate the total widths from the advance - bearing.
593  props->width = props->advance - props->bearing;
594  props->width_sd = props->advance_sd + props->bearing_sd;
595  }
596  return total_unicodes > 0;
597 }
598 
599 // TODO(rays) clean-up the order of functions to match unicharset.h.
600 
601 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
602  unsigned int properties = 0;
603  if (this->get_isalpha(id))
604  properties |= ISALPHA_MASK;
605  if (this->get_islower(id))
606  properties |= ISLOWER_MASK;
607  if (this->get_isupper(id))
608  properties |= ISUPPER_MASK;
609  if (this->get_isdigit(id))
610  properties |= ISDIGIT_MASK;
611  if (this->get_ispunctuation(id))
612  properties |= ISPUNCTUATION_MASK;
613  return properties;
614 }
615 
617  if (this->get_isupper(id)) return 'A';
618  if (this->get_islower(id)) return 'a';
619  if (this->get_isalpha(id)) return 'x';
620  if (this->get_isdigit(id)) return '0';
621  if (this->get_ispunctuation(id)) return 'p';
622  return 0;
623 }
624 
625 void UNICHARSET::unichar_insert(const char* const unichar_repr,
626  OldUncleanUnichars old_style) {
627  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
628  std::string cleaned =
629  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
630  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
631  const char* str = cleaned.c_str();
632  GenericVector<int> encoding;
633  if (!old_style_included_ &&
634  encode_string(str, true, &encoding, nullptr, nullptr))
635  return;
636  if (size_used == size_reserved) {
637  if (size_used == 0)
638  reserve(8);
639  else
640  reserve(2 * size_used);
641  }
642  int index = 0;
643  do {
644  if (index >= UNICHAR_LEN) {
645  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
646  unichar_repr);
647  return;
648  }
649  unichars[size_used].representation[index++] = *str++;
650  } while (*str != '\0');
651  unichars[size_used].representation[index] = '\0';
652  this->set_script(size_used, null_script);
653  // If the given unichar_repr represents a fragmented character, set
654  // fragment property to a pointer to CHAR_FRAGMENT class instance with
655  // information parsed from the unichar representation. Use the script
656  // of the base unichar for the fragmented character if possible.
657  CHAR_FRAGMENT* frag =
658  CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
659  this->unichars[size_used].properties.fragment = frag;
660  if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
661  this->unichars[size_used].properties.script_id =
662  this->get_script(frag->get_unichar());
663  }
664  this->unichars[size_used].properties.enabled = true;
665  ids.insert(unichars[size_used].representation, size_used);
666  ++size_used;
667  }
668 }
669 
670 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
671  std::string cleaned =
672  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
673  return ids.contains(cleaned.data(), cleaned.size());
674 }
675 
676 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
677  int length) const {
678  if (length == 0) {
679  return false;
680  }
681  std::string cleaned(unichar_repr, length);
682  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
683  return ids.contains(cleaned.data(), cleaned.size());
684 }
685 
686 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
687  const char* const unichar_repr) const {
688  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
689 }
690 
692  const int kFileBufSize = 1024;
693  char buffer[kFileBufSize + 1];
694  snprintf(buffer, kFileBufSize, "%d\n", this->size());
695  *str = buffer;
696  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
697  int min_bottom, max_bottom, min_top, max_top;
698  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
699  float width, width_sd;
700  get_width_stats(id, &width, &width_sd);
701  float bearing, bearing_sd;
702  get_bearing_stats(id, &bearing, &bearing_sd);
703  float advance, advance_sd;
704  get_advance_stats(id, &advance, &advance_sd);
705  unsigned int properties = this->get_properties(id);
706  if (strcmp(this->id_to_unichar(id), " ") == 0) {
707  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
708  this->get_script_from_script_id(this->get_script(id)),
709  this->get_other_case(id));
710  *str += buffer;
711  } else {
712  std::ostringstream stream;
713  stream.imbue(std::locale::classic());
714  stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
715  min_bottom << ',' << max_bottom << ',' <<
716  min_top << ',' << max_top << ',' <<
717  width << ',' << width_sd << ',' <<
718  bearing << ',' << bearing_sd << ',' <<
719  advance << ',' << advance_sd << ' ' <<
720  this->get_script_from_script_id(this->get_script(id)) << ' ' <<
721  this->get_other_case(id) << ' ' <<
722  this->get_direction(id) << ' ' <<
723  this->get_mirror(id) << ' ' <<
724  this->get_normed_unichar(id) << "\t# " <<
725  this->debug_str(id).c_str() << '\n';
726  *str += stream.str().c_str();
727  }
728  }
729  return true;
730 }
731 
733  public:
734  LocalFilePointer(FILE *stream) : fp_(stream) {}
735  char *fgets(char *dst, int size) {
736  return ::fgets(dst, size, fp_);
737  }
738  private:
739  FILE *fp_;
740 };
741 
742 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
743  LocalFilePointer lfp(file);
744  using namespace std::placeholders; // for _1, _2
745  std::function<char*(char*, int)> fgets_cb =
746  std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
747  bool success = load_via_fgets(fgets_cb, skip_fragments);
748  return success;
749 }
750 
751 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
752  using namespace std::placeholders; // for _1, _2
753  std::function<char*(char*, int)> fgets_cb =
754  std::bind(&tesseract::TFile::FGets, file, _1, _2);
755  bool success = load_via_fgets(fgets_cb, skip_fragments);
756  return success;
757 }
758 
759 bool UNICHARSET::load_via_fgets(std::function<char*(char*, int)> fgets_cb,
760  bool skip_fragments) {
761  int unicharset_size;
762  char buffer[256];
763 
764  this->clear();
765  if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||
766  sscanf(buffer, "%d", &unicharset_size) != 1) {
767  return false;
768  }
769  this->reserve(unicharset_size);
770  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
771  char unichar[256];
772  unsigned int properties;
773  char script[64];
774 
775  strncpy(script, null_script, sizeof(script) - 1);
776  int min_bottom = 0;
777  int max_bottom = UINT8_MAX;
778  int min_top = 0;
779  int max_top = UINT8_MAX;
780  float width = 0.0f;
781  float width_sd = 0.0f;
782  float bearing = 0.0f;
783  float bearing_sd = 0.0f;
784  float advance = 0.0f;
785  float advance_sd = 0.0f;
786  // TODO(eger): check that this default is ok
787  // after enabling BiDi iterator for Arabic.
788  int direction = UNICHARSET::U_LEFT_TO_RIGHT;
789  UNICHAR_ID other_case = unicharset_size;
790  UNICHAR_ID mirror = unicharset_size;
791  if (fgets_cb(buffer, sizeof (buffer)) == nullptr) {
792  return false;
793  }
794  char normed[64];
795  normed[0] = '\0';
796  std::istringstream stream(buffer);
797  stream.imbue(std::locale::classic());
798  // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
799  //stream.flags(std::ios::hex);
800  stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
801  //stream.flags(std::ios::dec);
802  if (stream.fail()) {
803  fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
804  return false;
805  }
806  auto position = stream.tellg();
807  stream.seekg(position);
808  char c1, c2, c3, c4, c5, c6, c7, c8, c9;
809  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
810  width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
811  advance >> c9 >> advance_sd >> std::setw(63) >> script >>
812  other_case >> direction >> mirror >> std::setw(63) >> normed;
813  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
814  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
815  stream.clear();
816  stream.seekg(position);
817  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
818  width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
819  advance >> c9 >> advance_sd >> std::setw(63) >> script >>
820  other_case >> direction >> mirror;
821  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
822  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
823  stream.clear();
824  stream.seekg(position);
825  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
826  std::setw(63) >> script >> other_case >> direction >> mirror;
827  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
828  stream.clear();
829  stream.seekg(position);
830  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
831  std::setw(63) >> script >> other_case;
832  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
833  stream.clear();
834  stream.seekg(position);
835  stream >> std::setw(63) >> script >> other_case;
836  if (stream.fail()) {
837  stream.clear();
838  stream.seekg(position);
839  stream >> std::setw(63) >> script;
840  }
841  }
842  }
843  }
844  }
845 
846  // Skip fragments if needed.
847  CHAR_FRAGMENT *frag = nullptr;
848  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
849  int num_pieces = frag->get_total();
850  delete frag;
851  // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
852  if (num_pieces > 1)
853  continue;
854  }
855  // Insert unichar into unicharset and set its properties.
856  if (strcmp(unichar, "NULL") == 0)
857  this->unichar_insert(" ");
858  else
860 
861  this->set_isalpha(id, properties & ISALPHA_MASK);
862  this->set_islower(id, properties & ISLOWER_MASK);
863  this->set_isupper(id, properties & ISUPPER_MASK);
864  this->set_isdigit(id, properties & ISDIGIT_MASK);
865  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
866  this->set_isngram(id, false);
867  this->set_script(id, script);
868  this->unichars[id].properties.enabled = true;
869  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
870  this->set_width_stats(id, width, width_sd);
871  this->set_bearing_stats(id, bearing, bearing_sd);
872  this->set_advance_stats(id, advance, advance_sd);
873  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
874  this->set_other_case(
875  id, (other_case < unicharset_size) ? other_case : id);
876  this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
877  this->set_normed(id, normed[0] != '\0' ? normed : unichar);
878  }
879  post_load_setup();
880  return true;
881 }
882 
883 // Sets up internal data after loading the file, based on the char
884 // properties. Called from load_from_file, but also needs to be run
885 // during set_unicharset_properties.
887  // Number of alpha chars with the case property minus those without,
888  // in order to determine that half the alpha chars have case.
889  int net_case_alphas = 0;
890  int x_height_alphas = 0;
891  int cap_height_alphas = 0;
892  top_bottom_set_ = false;
893  for (UNICHAR_ID id = 0; id < size_used; ++id) {
894  int min_bottom = 0;
895  int max_bottom = UINT8_MAX;
896  int min_top = 0;
897  int max_top = UINT8_MAX;
898  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
899  if (min_top > 0)
900  top_bottom_set_ = true;
901  if (get_isalpha(id)) {
902  if (get_islower(id) || get_isupper(id))
903  ++net_case_alphas;
904  else
905  --net_case_alphas;
906  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
907  ++x_height_alphas;
908  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
909  ++cap_height_alphas;
910  }
911  set_normed_ids(id);
912  }
913 
914  script_has_upper_lower_ = net_case_alphas > 0;
915  script_has_xheight_ = script_has_upper_lower_ ||
916  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
917  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
918 
919  null_sid_ = get_script_id_from_name(null_script);
920  ASSERT_HOST(null_sid_ == 0);
921  common_sid_ = get_script_id_from_name("Common");
922  latin_sid_ = get_script_id_from_name("Latin");
923  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
924  greek_sid_ = get_script_id_from_name("Greek");
925  han_sid_ = get_script_id_from_name("Han");
926  hiragana_sid_ = get_script_id_from_name("Hiragana");
927  katakana_sid_ = get_script_id_from_name("Katakana");
928  thai_sid_ = get_script_id_from_name("Thai");
929  hangul_sid_ = get_script_id_from_name("Hangul");
930 
931  // Compute default script. Use the highest-counting alpha script, that is
932  // not the common script, as that still contains some "alphas".
933  int* script_counts = new int[script_table_size_used];
934  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
935  for (int id = 0; id < size_used; ++id) {
936  if (get_isalpha(id)) {
937  ++script_counts[get_script(id)];
938  }
939  }
940  default_sid_ = 0;
941  for (int s = 1; s < script_table_size_used; ++s) {
942  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
943  default_sid_ = s;
944  }
945  delete [] script_counts;
946 }
947 
948 // Returns true if right_to_left scripts are significant in the unicharset,
949 // but without being so sensitive that "universal" unicharsets containing
950 // characters from many scripts, like orientation and script detection,
951 // look like they are right_to_left.
953  int ltr_count = 0;
954  int rtl_count = 0;
955  for (int id = 0; id < size_used; ++id) {
956  int dir = get_direction(id);
957  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
958  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
960  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
961  }
962  return rtl_count > ltr_count;
963 }
964 
965 // Set a whitelist and/or blacklist of characters to recognize.
966 // An empty or nullptr whitelist enables everything (minus any blacklist).
967 // An empty or nullptr blacklist disables nothing.
968 // An empty or nullptr blacklist has no effect.
969 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
970  const char* whitelist,
971  const char* unblacklist) {
972  bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
973  // Set everything to default
974  for (int ch = 0; ch < size_used; ++ch)
975  unichars[ch].properties.enabled = def_enabled;
976  if (!def_enabled) {
977  // Enable the whitelist.
978  GenericVector<UNICHAR_ID> encoding;
979  encode_string(whitelist, false, &encoding, nullptr, nullptr);
980  for (int i = 0; i < encoding.size(); ++i) {
981  if (encoding[i] != INVALID_UNICHAR_ID)
982  unichars[encoding[i]].properties.enabled = true;
983  }
984  }
985  if (blacklist != nullptr && blacklist[0] != '\0') {
986  // Disable the blacklist.
987  GenericVector<UNICHAR_ID> encoding;
988  encode_string(blacklist, false, &encoding, nullptr, nullptr);
989  for (int i = 0; i < encoding.size(); ++i) {
990  if (encoding[i] != INVALID_UNICHAR_ID)
991  unichars[encoding[i]].properties.enabled = false;
992  }
993  }
994  if (unblacklist != nullptr && unblacklist[0] != '\0') {
995  // Re-enable the unblacklist.
996  GenericVector<UNICHAR_ID> encoding;
997  encode_string(unblacklist, false, &encoding, nullptr, nullptr);
998  for (int i = 0; i < encoding.size(); ++i) {
999  if (encoding[i] != INVALID_UNICHAR_ID)
1000  unichars[encoding[i]].properties.enabled = true;
1001  }
1002  }
1003 }
1004 
1005 // Returns true if there are any repeated unicodes in the normalized
1006 // text of any unichar-id in the unicharset.
1008  int start_id = 0;
1010  for (int id = start_id; id < size_used; ++id) {
1011  // Convert to unicodes.
1012  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1013  for (size_t u = 1; u < unicodes.size(); ++u) {
1014  if (unicodes[u - 1] == unicodes[u]) return true;
1015  }
1016  }
1017  return false;
1018 }
1019 
1020 int UNICHARSET::add_script(const char* script) {
1021  for (int i = 0; i < script_table_size_used; ++i) {
1022  if (strcmp(script, script_table[i]) == 0)
1023  return i;
1024  }
1025  if (script_table_size_reserved == 0) {
1026  script_table_size_reserved = 8;
1027  script_table = new char*[script_table_size_reserved];
1028  } else if (script_table_size_used >= script_table_size_reserved) {
1029  assert(script_table_size_used == script_table_size_reserved);
1030  script_table_size_reserved += script_table_size_reserved;
1031  char** new_script_table = new char*[script_table_size_reserved];
1032  memcpy(new_script_table, script_table,
1033  script_table_size_used * sizeof(char*));
1034  delete[] script_table;
1035  script_table = new_script_table;
1036  }
1037  script_table[script_table_size_used] = new char[strlen(script) + 1];
1038  strcpy(script_table[script_table_size_used], script);
1039  return script_table_size_used++;
1040 }
1041 
1042 // Returns the string that represents a fragment
1043 // with the given unichar, pos and total.
1044 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1045  bool natural) {
1046  if (total == 1) return STRING(unichar);
1047  STRING result = "";
1048  result += kSeparator;
1049  result += unichar;
1050  char buffer[kMaxLen];
1051  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1052  natural ? kNaturalFlag : kSeparator, total);
1053  result += buffer;
1054  return result;
1055 }
1056 
1058  const char *ptr = string;
1059  int len = strlen(string);
1060  if (len < kMinLen || *ptr != kSeparator) {
1061  return nullptr; // this string can not represent a fragment
1062  }
1063  ptr++; // move to the next character
1064  int step = 0;
1065  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1066  step += UNICHAR::utf8_step(ptr + step);
1067  }
1068  if (step == 0 || step > UNICHAR_LEN) {
1069  return nullptr; // no character for unichar or the character is too long
1070  }
1071  char unichar[UNICHAR_LEN + 1];
1072  strncpy(unichar, ptr, step);
1073  unichar[step] = '\0'; // null terminate unichar
1074  ptr += step; // move to the next fragment separator
1075  int pos = 0;
1076  int total = 0;
1077  bool natural = false;
1078  char *end_ptr = nullptr;
1079  for (int i = 0; i < 2; i++) {
1080  if (ptr > string + len || *ptr != kSeparator) {
1081  if (i == 1 && *ptr == kNaturalFlag)
1082  natural = true;
1083  else
1084  return nullptr; // Failed to parse fragment representation.
1085  }
1086  ptr++; // move to the next character
1087  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1088  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1089  ptr = end_ptr;
1090  }
1091  if (ptr != string + len) {
1092  return nullptr; // malformed fragment representation
1093  }
1094  auto *fragment = new CHAR_FRAGMENT();
1095  fragment->set_all(unichar, pos, total, natural);
1096  return fragment;
1097 }
1098 
1099 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
1100  for (int i = 0; i < script_table_size_used; ++i) {
1101  if (strcmp(script_name, script_table[i]) == 0)
1102  return i;
1103  }
1104  return 0; // 0 is always the null_script
1105 }
1106 
1107 // Removes/replaces content that belongs in rendered text, but not in the
1108 // unicharset.
1109 /* static */
1110 std::string UNICHARSET::CleanupString(const char* utf8_str, size_t length) {
1111  std::string result;
1112  result.reserve(length);
1113  char ch;
1114  while ((ch = *utf8_str) != '\0' && length-- > 0) {
1115  int key_index = 0;
1116  const char* key;
1117  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1118  int match = 0;
1119  while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1120  if (key[match] == '\0') {
1121  utf8_str += match;
1122  break;
1123  }
1124  ++key_index;
1125  }
1126  if (key == nullptr) {
1127  result.push_back(ch);
1128  ++utf8_str;
1129  } else {
1130  result.append(kCleanupMaps[key_index][1]);
1131  }
1132  }
1133  return result;
1134 }
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
string
std::string string
Definition: equationdetect_test.cc:21
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
UNICHARSET::set_isupper
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
UNICHARSET::UNICHARSET
UNICHARSET()
Definition: unicharset.cpp:175
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
UNICHARSET::AppendOtherUnicharset
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:463
LocalFilePointer::fgets
char * fgets(char *dst, int size)
Definition: unicharset.cpp:735
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
UNICHARSET::set_islower
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:426
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
UNICHARSET::get_script_id_from_name
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:1099
UNICHARSET::encodable_string
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:243
CHAR_FRAGMENT::to_string
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:1044
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
UNICHARSET::~UNICHARSET
~UNICHARSET()
Definition: unicharset.cpp:190
UNICHARMAP::contains
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
params.h
OldUncleanUnichars::kTrue
UNICHARSET::set_normed
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:472
CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:70
UNICHARSET::set_direction
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:462
UNICHARSET::get_advance_stats
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:620
STRING
Definition: strngs.h:45
UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:844
LocalFilePointer
Definition: unicharset.cpp:732
UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:157
UNICHARSET::set_normed_ids
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:372
UNICHARSET::set_top_bottom
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:572
UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
UNICHARSET::get_width_stats
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:586
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
UNICHARSET::get_normed_unichar
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:818
UNICHARSET::set_bearing_stats
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:613
tesseract::UNICHAR
Definition: unichar.h:59
UNICHARSET::clear
void clear()
Definition: unicharset.h:306
UNICHARSET::set_isalpha
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:421
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
UNICHARSET::kCustomLigatures
static const TESS_API char * kCustomLigatures[][2]
Definition: unicharset.h:150
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
UNICHARSET::set_width_stats
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:597
OldUncleanUnichars
OldUncleanUnichars
Definition: unicharset.h:43
CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1057
UNICHARSET::major_right_to_left
bool major_right_to_left() const
Definition: unicharset.cpp:952
UNICHARMAP::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
UNICHARSET::add_script
int add_script(const char *script)
Definition: unicharset.cpp:1020
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
UNICHARSET::get_properties
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:601
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
UNICHARSET::set_advance_stats
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:630
UNICHARSET::set_black_and_whitelist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:969
UNICHARSET::AnyRepeatedUnicodes
bool AnyRepeatedUnicodes() const
Definition: unicharset.cpp:1007
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
file
Definition: include_gunit.h:22
UNICHARSET::reserve
void reserve(int unichars_number)
Definition: unicharset.cpp:194
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
tesseract::char32
signed int char32
Definition: unichar.h:53
UNICHARSET::set_ranges_empty
void set_ranges_empty()
Definition: unicharset.cpp:395
UNICHARSET::get_isprivate
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:387
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_SPACE
Definition: unicharset.h:34
tesseract::UNICHAR::first_uni
int first_uni() const
Definition: unichar.cpp:98
UNICHARMAP::minmatch
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
tesseract::TFile
Definition: serialis.h:75
UNICHARSET::save_to_string
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:691
UNICHARSET::U_ARABIC_NUMBER
Definition: unicharset.h:162
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::get_chartype
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:616
UNICHARSET::set_mirror
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:467
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
UNICHARSET::set_script
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:452
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
UNICHARSET::set_ispunctuation
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:72
CHAR_FRAGMENT::to_string
STRING to_string() const
Definition: unicharset.h:79
UNICHARSET::unichar_insert_backwards_compatible
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:712
UNICHARSET::post_load_setup
void post_load_setup()
Definition: unicharset.cpp:886
UNICHARSET::kSpecialUnicharCodes
static const TESS_API char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector< UNICHAR_ID >
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
CHAR_FRAGMENT
Definition: unicharset.h:48
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
UNICHARSET::PartialSetPropertiesFromOther
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:404
kMinXHeightFraction
const double kMinXHeightFraction
Definition: unicharset.cpp:58
UNICHARSET::set_isngram
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
unichar.h
kMinCapHeightFraction
const double kMinCapHeightFraction
Definition: unicharset.cpp:59
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
tesseract::TFile::FGets
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:262
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
UNICHARSET::get_bearing_stats
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:603
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:485
serialis.h
LocalFilePointer::LocalFilePointer
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:734
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:673
UNICHARMAP::insert
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56
UpdateRange
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:118
UNICHARSET::debug_utf8_str
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:318
UNICHARSET::set_isdigit
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
GenericVector::size
int size() const
Definition: genericvector.h:71
UNICHARSET::set_other_case
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:457
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341
UNICHARSET::ExpandRangesFromOther
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:434
UNICHARSET::CopyFrom
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:447
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
UNICHAR_JOINED
Definition: unicharset.h:35