tesseract  5.0.0-alpha-619-ge9db
unicharset.h
Go to the documentation of this file.
1 // File: unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20 #define TESSERACT_CCUTIL_UNICHARSET_H_
21 
22 #include <functional> // for std::function
23 #include "errcode.h"
25 #include <tesseract/helpers.h>
26 #include <tesseract/serialis.h>
27 #include <tesseract/strngs.h>
28 #include <tesseract/unichar.h>
29 #include "unicharmap.h"
30 
31 // Enum holding special values of unichar_id. Every unicharset has these.
32 // Warning! Keep in sync with kSpecialUnicharCodes.
37 
39 };
40 
// Two-valued flag accepted by unichar_insert. The "old/unclean" wording is
// a deliberate double negative: it lets the first (zero) enumerator, kFalse,
// be the default.
enum class OldUncleanUnichars { kFalse, kTrue };
47 
49  public:
50  // Minimum number of characters used for fragment representation.
51  static const int kMinLen = 6;
52  // Maximum number of characters used for fragment representation.
53  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
54  // Maximum number of fragments per character.
55  static const int kMaxChunks = 5;
56 
57  // Setters and Getters.
58  inline void set_all(const char *unichar, int pos, int total, bool natural) {
59  set_unichar(unichar);
60  set_pos(pos);
61  set_total(total);
62  set_natural(natural);
63  }
64  inline void set_unichar(const char *uch) {
65  strncpy(this->unichar, uch, sizeof(this->unichar));
66  this->unichar[UNICHAR_LEN] = '\0';
67  }
68  inline void set_pos(int p) { this->pos = p; }
69  inline void set_total(int t) { this->total = t; }
70  inline const char* get_unichar() const { return this->unichar; }
71  inline int get_pos() const { return this->pos; }
72  inline int get_total() const { return this->total; }
73 
74  // Returns the string that represents a fragment
75  // with the given unichar, pos and total.
76  static STRING to_string(const char *unichar, int pos, int total,
77  bool natural);
78  // Returns the string that represents this fragment.
79  STRING to_string() const {
80  return to_string(unichar, pos, total, natural);
81  }
82 
83  // Checks whether a fragment has the same unichar,
84  // position and total as the given inputs.
85  inline bool equals(const char *other_unichar,
86  int other_pos, int other_total) const {
87  return (strcmp(this->unichar, other_unichar) == 0 &&
88  this->pos == other_pos && this->total == other_total);
89  }
90  inline bool equals(const CHAR_FRAGMENT *other) const {
91  return this->equals(other->get_unichar(),
92  other->get_pos(),
93  other->get_total());
94  }
95 
96  // Checks whether a given fragment is a continuation of this fragment.
97  // Assumes that the given fragment pointer is not nullptr.
98  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
99  return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
100  this->total == fragment->get_total() &&
101  this->pos == fragment->get_pos() + 1);
102  }
103 
104  // Returns true if this fragment is a beginning fragment.
105  inline bool is_beginning() const { return this->pos == 0; }
106 
107  // Returns true if this fragment is an ending fragment.
108  inline bool is_ending() const { return this->pos == this->total-1; }
109 
110  // Returns true if the fragment was a separate component to begin with,
111  // ie did not need chopping to be isolated, but may have been separated
112  // out from a multi-outline blob.
113  inline bool is_natural() const { return natural; }
114  void set_natural(bool value) { natural = value; }
115 
116  // Parses the string to see whether it represents a character fragment
117  // (rather than a regular character). If so, allocates memory for a new
118  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
119  // information. Fragments are of the form:
120  // |m|1|2, meaning chunk 1 of 2 of character m, or
121  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
122  // to divide the parts, as they were already separate connected components.
123  //
124  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
125  // instance, otherwise (if the string does not represent a fragment or it
126  // looks like it does, but parsing it as a fragment fails) returns nullptr.
127  //
128  // Note: The caller is responsible for deallocating memory
129  // associated with the returned pointer.
130  static CHAR_FRAGMENT *parse_from_string(const char *str);
131 
132  private:
133  char unichar[UNICHAR_LEN + 1];
134  // True if the fragment was a separate component to begin with,
135  // ie did not need chopping to be isolated, but may have been separated
136  // out from a multi-outline blob.
137  bool natural;
138  int16_t pos; // fragment position in the character
139  int16_t total; // total number of fragments in the character
140 };
141 
142 // The UNICHARSET class is a utility class for Tesseract that holds the
143 // set of characters that are used by the engine. Each character is identified
144 // by a unique number, from 0 to (size - 1).
145 class UNICHARSET {
146  public:
147  // Custom list of characters and their ligature forms (UTF8)
148  // These map to unicode values in the private use area (PUC) and are supported
149  // by only few font families (eg. Wyld, Adobe Caslon Pro).
150  static TESS_API const char* kCustomLigatures[][2];
151 
152  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
154 
155  // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
156  enum Direction {
180 #ifndef U_HIDE_DEPRECATED_API
182 #endif // U_HIDE_DEPRECATED_API
183  };
184 
185  // Create an empty UNICHARSET
186  UNICHARSET();
187 
188  ~UNICHARSET();
189 
190  // Return the UNICHAR_ID of a given unichar representation within the
191  // UNICHARSET.
192  UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
193 
194  // Return the UNICHAR_ID of a given unichar representation within the
195  // UNICHARSET. Only the first length characters from unichar_repr are used.
196  UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
197 
198  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
199  // while leaving the rest of the string encodable. Returns 0 if the
200  // beginning of the string is not encodable.
201  // WARNING: this function now encodes the whole string for precision.
202  // Use encode_string in preference to repeatedly calling step.
203  int step(const char* str) const;
204 
205  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
206  // If not encodable, write the first byte offset which cannot be converted
207  // into the second (return) argument.
208  bool encodable_string(const char *str, int *first_bad_position) const;
209 
210 // Encodes the given UTF-8 string with this UNICHARSET.
211 // If any part of the string cannot be encoded (because the utf8 can't
212 // be broken up into pieces that are in the unicharset) then:
213 // if give_up_on_failure, stops and returns a partial encoding,
214 // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
215  // Returns true if the encoding succeeds completely, false if there is at
216  // least one failure.
217  // If lengths is not nullptr, then it is filled with the corresponding
218  // byte length of each encoded UNICHAR_ID.
219  // If encoded_length is not nullptr then on return it contains the length of
220  // str that was encoded. (if give_up_on_failure the location of the first
221  // failure, otherwise strlen(str).)
222  // WARNING: Caller must guarantee that str has already been cleaned of codes
223  // that do not belong in the unicharset, or encoding may fail.
224  // Use CleanupString to perform the cleaning.
225  bool encode_string(const char* str, bool give_up_on_failure,
226  GenericVector<UNICHAR_ID>* encoding,
227  GenericVector<char>* lengths,
228  int* encoded_length) const;
229 
230  // Return the unichar representation corresponding to the given UNICHAR_ID
231  // within the UNICHARSET.
232  const char* id_to_unichar(UNICHAR_ID id) const;
233 
234  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
235  // resolving any private encodings internal to Tesseract. This method is
236  // preferable to id_to_unichar for outputting text that will be visible to
237  // external applications.
238  const char* id_to_unichar_ext(UNICHAR_ID id) const;
239 
240  // Return a STRING that reformats the utf8 str into the str followed
241  // by its hex unicodes.
242  static STRING debug_utf8_str(const char* str);
243 
244  // Removes/replaces content that belongs in rendered text, but not in the
245  // unicharset.
246  static std::string CleanupString(const char* utf8_str) {
247  return CleanupString(utf8_str, strlen(utf8_str));
248  }
249  static std::string CleanupString(const char* utf8_str, size_t length);
250 
251  // Return a STRING containing debug information on the unichar, including
252  // the id_to_unichar, its hex unicodes and the properties.
253  STRING debug_str(UNICHAR_ID id) const;
254  STRING debug_str(const char * unichar_repr) const {
255  return debug_str(unichar_to_id(unichar_repr));
256  }
257 
258  // Adds a unichar representation to the set. If old_style is true, then
259  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
260  // characters are ignored/skipped as if they don't exist and n-grams that
261  // can already be encoded are not added.
262  void unichar_insert(const char* const unichar_repr,
263  OldUncleanUnichars old_style);
264  void unichar_insert(const char* const unichar_repr) {
266  }
267  // Adds a unichar representation to the set. Avoids setting old_style to true,
268  // unless it is necessary to make the new unichar get added.
269  void unichar_insert_backwards_compatible(const char* const unichar_repr) {
270  std::string cleaned = CleanupString(unichar_repr);
271  if (cleaned != unichar_repr) {
273  } else {
274  int old_size = size();
276  if (size() == old_size) {
278  }
279  }
280  }
281 
282  // Return true if the given unichar id exists within the set.
283  // Relies on the fact that unichar ids are contiguous in the unicharset.
284  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
285  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
286  unichar_id >= 0;
287  }
288 
289  // Return true if the given unichar representation exists within the set.
290  bool contains_unichar(const char* const unichar_repr) const;
291  bool contains_unichar(const char* const unichar_repr, int length) const;
292 
293  // Return true if the given unichar representation corresponds to the given
294  // UNICHAR_ID within the set.
295  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
296 
297  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
299  for (int i = 0; i < size_used; ++i) {
300  delete unichars[i].properties.fragment;
301  unichars[i].properties.fragment = nullptr;
302  }
303  }
304 
305  // Clear the UNICHARSET (all the previous data is lost).
306  void clear() {
307  if (script_table != nullptr) {
308  for (int i = 0; i < script_table_size_used; ++i)
309  delete[] script_table[i];
310  delete[] script_table;
311  script_table = nullptr;
312  script_table_size_used = 0;
313  }
314  if (unichars != nullptr) {
316  delete[] unichars;
317  unichars = nullptr;
318  }
319  script_table_size_reserved = 0;
320  size_reserved = 0;
321  size_used = 0;
322  ids.clear();
323  top_bottom_set_ = false;
324  script_has_upper_lower_ = false;
325  script_has_xheight_ = false;
326  old_style_included_ = false;
327  null_sid_ = 0;
328  common_sid_ = 0;
329  latin_sid_ = 0;
330  cyrillic_sid_ = 0;
331  greek_sid_ = 0;
332  han_sid_ = 0;
333  hiragana_sid_ = 0;
334  katakana_sid_ = 0;
335  thai_sid_ = 0;
336  hangul_sid_ = 0;
337  default_sid_ = 0;
338  }
339 
340  // Return the size of the set (the number of different UNICHAR it holds).
341  int size() const {
342  return size_used;
343  }
344 
345  // Reserve enough memory space for the given number of UNICHARS
346  void reserve(int unichars_number);
347 
348  // Opens the file indicated by filename and saves unicharset to that file.
349  // Returns true if the operation is successful.
350  bool save_to_file(const char * const filename) const {
351  FILE* file = fopen(filename, "w+b");
352  if (file == nullptr) return false;
353  bool result = save_to_file(file);
354  fclose(file);
355  return result;
356  }
357 
358  // Saves the content of the UNICHARSET to the given file.
359  // Returns true if the operation is successful.
360  bool save_to_file(FILE *file) const {
361  STRING str;
362  return save_to_string(&str) &&
363  tesseract::Serialize(file, &str[0], str.length());
364  }
365 
367  STRING str;
368  return save_to_string(&str) && file->Serialize(&str[0], str.length());
369  }
370 
371  // Saves the content of the UNICHARSET to the given STRING.
372  // Returns true if the operation is successful.
373  bool save_to_string(STRING *str) const;
374 
375  // Opens the file indicated by filename and loads the UNICHARSET
376  // from the given file. The previous data is lost.
377  // Returns true if the operation is successful.
378  bool load_from_file(const char* const filename, bool skip_fragments) {
379  FILE* file = fopen(filename, "rb");
380  if (file == nullptr) return false;
381  bool result = load_from_file(file, skip_fragments);
382  fclose(file);
383  return result;
384  }
385  // returns true if the operation is successful.
386  bool load_from_file(const char* const filename) {
387  return load_from_file(filename, false);
388  }
389 
390  // Loads the UNICHARSET from the given file. The previous data is lost.
391  // Returns true if the operation is successful.
392  bool load_from_file(FILE *file, bool skip_fragments);
393  bool load_from_file(FILE *file) { return load_from_file(file, false); }
394  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
395 
396 
397  // Sets up internal data after loading the file, based on the char
398  // properties. Called from load_from_file, but also needs to be run
399  // during set_unicharset_properties.
400  void post_load_setup();
401 
402  // Returns true if right_to_left scripts are significant in the unicharset,
403  // but without being so sensitive that "universal" unicharsets containing
404  // characters from many scripts, like orientation and script detection,
405  // look like they are right_to_left.
406  bool major_right_to_left() const;
407 
408  // Set a whitelist and/or blacklist of characters to recognize.
409  // An empty or nullptr whitelist enables everything (minus any blacklist).
410  // An empty or nullptr blacklist disables nothing.
411  // An empty or nullptr unblacklist has no effect.
412  // The blacklist overrides the whitelist.
413  // The unblacklist overrides the blacklist.
414  // Each list is a string of utf8 character strings. Boundaries between
415  // unicharset units are worked out automatically, and characters not in
416  // the unicharset are silently ignored.
417  void set_black_and_whitelist(const char* blacklist, const char* whitelist,
418  const char* unblacklist);
419 
420  // Set the isalpha property of the given unichar to the given value.
421  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
422  unichars[unichar_id].properties.isalpha = value;
423  }
424 
425  // Set the islower property of the given unichar to the given value.
426  void set_islower(UNICHAR_ID unichar_id, bool value) {
427  unichars[unichar_id].properties.islower = value;
428  }
429 
430  // Set the isupper property of the given unichar to the given value.
431  void set_isupper(UNICHAR_ID unichar_id, bool value) {
432  unichars[unichar_id].properties.isupper = value;
433  }
434 
435  // Set the isdigit property of the given unichar to the given value.
436  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
437  unichars[unichar_id].properties.isdigit = value;
438  }
439 
440  // Set the ispunctuation property of the given unichar to the given value.
441  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
442  unichars[unichar_id].properties.ispunctuation = value;
443  }
444 
445  // Set the isngram property of the given unichar to the given value.
446  void set_isngram(UNICHAR_ID unichar_id, bool value) {
447  unichars[unichar_id].properties.isngram = value;
448  }
449 
450  // Set the script name of the given unichar to the given value.
451  // Value is copied and thus can be a temporary;
452  void set_script(UNICHAR_ID unichar_id, const char* value) {
453  unichars[unichar_id].properties.script_id = add_script(value);
454  }
455 
456  // Set other_case unichar id in the properties for the given unichar id.
457  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
458  unichars[unichar_id].properties.other_case = other_case;
459  }
460 
461  // Set the direction property of the given unichar to the given value.
463  unichars[unichar_id].properties.direction = value;
464  }
465 
466  // Set mirror unichar id in the properties for the given unichar id.
467  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
468  unichars[unichar_id].properties.mirror = mirror;
469  }
470 
471  // Record normalized version of unichar with the given unichar_id.
472  void set_normed(UNICHAR_ID unichar_id, const char* normed) {
473  unichars[unichar_id].properties.normed = normed;
474  unichars[unichar_id].properties.normed_ids.truncate(0);
475  }
476  // Sets the normed_ids vector from the normed string. normed_ids is not
477  // stored in the file, and needs to be set when the UNICHARSET is loaded.
478  void set_normed_ids(UNICHAR_ID unichar_id);
479 
480  // Return the isalpha property of the given unichar.
481  bool get_isalpha(UNICHAR_ID unichar_id) const {
482  if (INVALID_UNICHAR_ID == unichar_id) return false;
483  ASSERT_HOST(contains_unichar_id(unichar_id));
484  return unichars[unichar_id].properties.isalpha;
485  }
486 
487  // Return the islower property of the given unichar.
488  bool get_islower(UNICHAR_ID unichar_id) const {
489  if (INVALID_UNICHAR_ID == unichar_id) return false;
490  ASSERT_HOST(contains_unichar_id(unichar_id));
491  return unichars[unichar_id].properties.islower;
492  }
493 
494  // Return the isupper property of the given unichar.
495  bool get_isupper(UNICHAR_ID unichar_id) const {
496  if (INVALID_UNICHAR_ID == unichar_id) return false;
497  ASSERT_HOST(contains_unichar_id(unichar_id));
498  return unichars[unichar_id].properties.isupper;
499  }
500 
501  // Return the isdigit property of the given unichar.
502  bool get_isdigit(UNICHAR_ID unichar_id) const {
503  if (INVALID_UNICHAR_ID == unichar_id) return false;
504  ASSERT_HOST(contains_unichar_id(unichar_id));
505  return unichars[unichar_id].properties.isdigit;
506  }
507 
508  // Return the ispunctuation property of the given unichar.
509  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
510  if (INVALID_UNICHAR_ID == unichar_id) return false;
511  ASSERT_HOST(contains_unichar_id(unichar_id));
512  return unichars[unichar_id].properties.ispunctuation;
513  }
514 
515  // Return the isngram property of the given unichar.
516  bool get_isngram(UNICHAR_ID unichar_id) const {
517  if (INVALID_UNICHAR_ID == unichar_id) return false;
518  ASSERT_HOST(contains_unichar_id(unichar_id));
519  return unichars[unichar_id].properties.isngram;
520  }
521 
522  // Returns whether the unichar id represents a unicode value in the private
523  // use area.
524  bool get_isprivate(UNICHAR_ID unichar_id) const;
525 
526  // Returns true if the ids have useful min/max top/bottom values.
527  bool top_bottom_useful() const {
528  return top_bottom_set_;
529  }
530  // Sets all ranges to empty, so they can be expanded to set the values.
531  void set_ranges_empty();
532  // Sets all the properties for this unicharset given a src_unicharset with
533  // everything set. The unicharsets don't have to be the same, and graphemes
534  // are correctly accounted for.
537  }
538  // Sets properties from Other, starting only at the given index.
539  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
540  // Expands the tops and bottoms and widths for this unicharset given a
541  // src_unicharset with ranges in it. The unicharsets don't have to be the
542  // same, and graphemes are correctly accounted for.
543  void ExpandRangesFromOther(const UNICHARSET& src);
544 // Makes this a copy of src. Clears this completely first, so the automatic
545 // ids will not be present in this if not in src.
546  void CopyFrom(const UNICHARSET& src);
547  // For each id in src, if it does not occur in this, add it, as in
548  // SetPropertiesFromOther, otherwise expand the ranges, as in
549  // ExpandRangesFromOther.
550  void AppendOtherUnicharset(const UNICHARSET& src);
551  // Returns true if the acceptable ranges of the tops of the characters do
552  // not overlap, making their x-height calculations distinct.
553  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
554  // Returns the min and max bottom and top of the given unichar in
555  // baseline-normalized coordinates, ie, where the baseline is
556  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
557  // (See normalis.h for the definitions).
558  void get_top_bottom(UNICHAR_ID unichar_id,
559  int* min_bottom, int* max_bottom,
560  int* min_top, int* max_top) const {
561  if (INVALID_UNICHAR_ID == unichar_id) {
562  *min_bottom = *min_top = 0;
563  *max_bottom = *max_top = 256; // kBlnCellHeight
564  return;
565  }
566  ASSERT_HOST(contains_unichar_id(unichar_id));
567  *min_bottom = unichars[unichar_id].properties.min_bottom;
568  *max_bottom = unichars[unichar_id].properties.max_bottom;
569  *min_top = unichars[unichar_id].properties.min_top;
570  *max_top = unichars[unichar_id].properties.max_top;
571  }
572  void set_top_bottom(UNICHAR_ID unichar_id,
573  int min_bottom, int max_bottom,
574  int min_top, int max_top) {
575  unichars[unichar_id].properties.min_bottom =
576  ClipToRange<int>(min_bottom, 0, UINT8_MAX);
577  unichars[unichar_id].properties.max_bottom =
578  ClipToRange<int>(max_bottom, 0, UINT8_MAX);
579  unichars[unichar_id].properties.min_top =
580  ClipToRange<int>(min_top, 0, UINT8_MAX);
581  unichars[unichar_id].properties.max_top =
582  ClipToRange<int>(max_top, 0, UINT8_MAX);
583  }
584  // Returns the width stats (as mean, sd) of the given unichar relative to the
585  // median advance of all characters in the character set.
586  void get_width_stats(UNICHAR_ID unichar_id,
587  float* width, float* width_sd) const {
588  if (INVALID_UNICHAR_ID == unichar_id) {
589  *width = 0.0f;
590  *width_sd = 0.0f;;
591  return;
592  }
593  ASSERT_HOST(contains_unichar_id(unichar_id));
594  *width = unichars[unichar_id].properties.width;
595  *width_sd = unichars[unichar_id].properties.width_sd;
596  }
597  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
598  unichars[unichar_id].properties.width = width;
599  unichars[unichar_id].properties.width_sd = width_sd;
600  }
601  // Returns the stats of the x-bearing (as mean, sd) of the given unichar
602  // relative to the median advance of all characters in the character set.
603  void get_bearing_stats(UNICHAR_ID unichar_id,
604  float* bearing, float* bearing_sd) const {
605  if (INVALID_UNICHAR_ID == unichar_id) {
606  *bearing = *bearing_sd = 0.0f;
607  return;
608  }
609  ASSERT_HOST(contains_unichar_id(unichar_id));
610  *bearing = unichars[unichar_id].properties.bearing;
611  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
612  }
613  void set_bearing_stats(UNICHAR_ID unichar_id,
614  float bearing, float bearing_sd) {
615  unichars[unichar_id].properties.bearing = bearing;
616  unichars[unichar_id].properties.bearing_sd = bearing_sd;
617  }
618  // Returns the stats of the x-advance of the given unichar (as mean, sd)
619  // relative to the median advance of all characters in the character set.
620  void get_advance_stats(UNICHAR_ID unichar_id,
621  float* advance, float* advance_sd) const {
622  if (INVALID_UNICHAR_ID == unichar_id) {
623  *advance = *advance_sd = 0;
624  return;
625  }
626  ASSERT_HOST(contains_unichar_id(unichar_id));
627  *advance = unichars[unichar_id].properties.advance;
628  *advance_sd = unichars[unichar_id].properties.advance_sd;
629  }
630  void set_advance_stats(UNICHAR_ID unichar_id,
631  float advance, float advance_sd) {
632  unichars[unichar_id].properties.advance = advance;
633  unichars[unichar_id].properties.advance_sd = advance_sd;
634  }
635  // Returns true if the font metrics properties are empty.
636  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
637  return unichars[unichar_id].properties.AnyRangeEmpty();
638  }
639 
640  // Returns true if the script of the given id is space delimited.
641  // Returns false for Han and Thai scripts.
642  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
643  if (INVALID_UNICHAR_ID == unichar_id) return true;
644  int script_id = get_script(unichar_id);
645  return script_id != han_sid_ && script_id != thai_sid_ &&
646  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
647  script_id != katakana_sid_;
648  }
649 
650  // Return the script name of the given unichar.
651  // The returned pointer will always be the same for the same script, it's
652  // managed by unicharset and thus MUST NOT be deleted
653  int get_script(UNICHAR_ID unichar_id) const {
654  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
655  ASSERT_HOST(contains_unichar_id(unichar_id));
656  return unichars[unichar_id].properties.script_id;
657  }
658 
659  // Return the character properties, eg. alpha/upper/lower/digit/punct,
660  // as a bit field of unsigned int.
661  unsigned int get_properties(UNICHAR_ID unichar_id) const;
662 
663  // Return the character property as a single char. If a character has
664  // multiple attributes, the main property is defined by the following order:
665  // upper_case : 'A'
666  // lower_case : 'a'
667  // alpha : 'x'
668  // digit : '0'
669  // punctuation: 'p'
670  char get_chartype(UNICHAR_ID unichar_id) const;
671 
672  // Get other_case unichar id in the properties for the given unichar id.
674  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
675  ASSERT_HOST(contains_unichar_id(unichar_id));
676  return unichars[unichar_id].properties.other_case;
677  }
678 
679  // Returns the direction property of the given unichar.
680  Direction get_direction(UNICHAR_ID unichar_id) const {
681  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
682  ASSERT_HOST(contains_unichar_id(unichar_id));
683  return unichars[unichar_id].properties.direction;
684  }
685 
686  // Get mirror unichar id in the properties for the given unichar id.
687  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
688  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
689  ASSERT_HOST(contains_unichar_id(unichar_id));
690  return unichars[unichar_id].properties.mirror;
691  }
692 
693  // Returns UNICHAR_ID of the corresponding lower-case unichar.
694  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
695  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
696  ASSERT_HOST(contains_unichar_id(unichar_id));
697  if (unichars[unichar_id].properties.islower) return unichar_id;
698  return unichars[unichar_id].properties.other_case;
699  }
700 
701  // Returns UNICHAR_ID of the corresponding upper-case unichar.
702  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
703  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
704  ASSERT_HOST(contains_unichar_id(unichar_id));
705  if (unichars[unichar_id].properties.isupper) return unichar_id;
706  return unichars[unichar_id].properties.other_case;
707  }
708 
709  // Returns true if this UNICHARSET has the special codes in
710  // SpecialUnicharCodes available. If false then there are normal unichars
711  // at these codes and they should not be used.
712  bool has_special_codes() const {
713  return get_fragment(UNICHAR_BROKEN) != nullptr &&
716  }
717 
718  // Returns true if there are any repeated unicodes in the normalized
719  // text of any unichar-id in the unicharset.
720  bool AnyRepeatedUnicodes() const;
721 
722  // Return a pointer to the CHAR_FRAGMENT class if the given
723  // unichar id represents a character fragment.
724  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
725  if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
726  ASSERT_HOST(contains_unichar_id(unichar_id));
727  return unichars[unichar_id].properties.fragment;
728  }
729 
730  // Return the isalpha property of the given unichar representation.
731  bool get_isalpha(const char* const unichar_repr) const {
732  return get_isalpha(unichar_to_id(unichar_repr));
733  }
734 
735  // Return the islower property of the given unichar representation.
736  bool get_islower(const char* const unichar_repr) const {
737  return get_islower(unichar_to_id(unichar_repr));
738  }
739 
740  // Return the isupper property of the given unichar representation.
741  bool get_isupper(const char* const unichar_repr) const {
742  return get_isupper(unichar_to_id(unichar_repr));
743  }
744 
745  // Return the isdigit property of the given unichar representation.
746  bool get_isdigit(const char* const unichar_repr) const {
747  return get_isdigit(unichar_to_id(unichar_repr));
748  }
749 
750  // Return the ispunctuation property of the given unichar representation.
751  bool get_ispunctuation(const char* const unichar_repr) const {
752  return get_ispunctuation(unichar_to_id(unichar_repr));
753  }
754 
755  // Return the character properties, eg. alpha/upper/lower/digit/punct,
756  // of the given unichar representation
757  unsigned int get_properties(const char* const unichar_repr) const {
758  return get_properties(unichar_to_id(unichar_repr));
759  }
760 
761  char get_chartype(const char* const unichar_repr) const {
762  return get_chartype(unichar_to_id(unichar_repr));
763  }
764 
765  // Return the script name of the given unichar representation.
766  // The returned pointer will always be the same for the same script, it's
767  // managed by unicharset and thus MUST NOT be deleted
768  int get_script(const char* const unichar_repr) const {
769  return get_script(unichar_to_id(unichar_repr));
770  }
771 
772  // Return a pointer to the CHAR_FRAGMENT class struct if the given
773  // unichar representation represents a character fragment.
774  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
775  if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
776  !ids.contains(unichar_repr, false)) {
777  return nullptr;
778  }
779  return get_fragment(unichar_to_id(unichar_repr));
780  }
781 
782  // Return the isalpha property of the given unichar representation.
783  // Only the first length characters from unichar_repr are used.
784  bool get_isalpha(const char* const unichar_repr,
785  int length) const {
786  return get_isalpha(unichar_to_id(unichar_repr, length));
787  }
788 
789  // Return the islower property of the given unichar representation.
790  // Only the first length characters from unichar_repr are used.
791  bool get_islower(const char* const unichar_repr,
792  int length) const {
793  return get_islower(unichar_to_id(unichar_repr, length));
794  }
795 
796  // Return the isupper property of the given unichar representation.
797  // Only the first length characters from unichar_repr are used.
798  bool get_isupper(const char* const unichar_repr,
799  int length) const {
800  return get_isupper(unichar_to_id(unichar_repr, length));
801  }
802 
803  // Return the isdigit property of the given unichar representation.
804  // Only the first length characters from unichar_repr are used.
805  bool get_isdigit(const char* const unichar_repr,
806  int length) const {
807  return get_isdigit(unichar_to_id(unichar_repr, length));
808  }
809 
810  // Return the ispunctuation property of the given unichar representation.
811  // Only the first length characters from unichar_repr are used.
812  bool get_ispunctuation(const char* const unichar_repr,
813  int length) const {
814  return get_ispunctuation(unichar_to_id(unichar_repr, length));
815  }
816 
817  // Returns normalized version of unichar with the given unichar_id.
818  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
819  if (unichar_id == UNICHAR_SPACE) return " ";
820  return unichars[unichar_id].properties.normed.c_str();
821  }
822  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
823  // version of the given id. There may be more than one UNICHAR_ID in the
824  // vector if unichar_id represents a ligature.
826  return unichars[unichar_id].properties.normed_ids;
827  }
828 
829  // Return the script name of the given unichar representation.
830  // Only the first length characters from unichar_repr are used.
831  // The returned pointer will always be the same for the same script, it's
832  // managed by unicharset and thus MUST NOT be deleted
833  int get_script(const char* const unichar_repr,
834  int length) const {
835  return get_script(unichar_to_id(unichar_repr, length));
836  }
837 
838  // Return the (current) number of scripts in the script table
839  int get_script_table_size() const {
840  return script_table_size_used;
841  }
842 
843  // Return the script string from its id
844  const char* get_script_from_script_id(int id) const {
845  if (id >= script_table_size_used || id < 0)
846  return null_script;
847  return script_table[id];
848  }
849 
850  // Returns the id from the name of the script, or 0 if script is not found.
851  // Note that this is an expensive operation since it involves iteratively
852  // comparing strings in the script table. To avoid dependency on STL, we
853  // won't use a hash. Instead, the calling function can use this to lookup
854  // and save the ID for relevant scripts for fast comparisons later.
// NOTE(review): 0 is also accepted as a script table index by
// get_script_from_script_id(), so the return value alone presumably cannot
// distinguish "not found" from the script stored at id 0 -- confirm which
// script occupies id 0.
855  int get_script_id_from_name(const char* script_name) const;
856 
857  // Return true if the given script is the null script
858  bool is_null_script(const char* script) const {
859  return script == null_script;
860  }
861 
862  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
863  // then the same value will be returned for both.
// NOTE(review): this comment used to say "returned pointer", but the
// declaration returns an int -- presumably the script id (index into the
// script table); confirm against the implementation.
864  // The script parameter is copied and thus can be a temporary.
865  int add_script(const char* script);
866 
867  // Return the enabled property of the given unichar.
868  bool get_enabled(UNICHAR_ID unichar_id) const {
869  ASSERT_HOST(contains_unichar_id(unichar_id));
870  return unichars[unichar_id].properties.enabled;
871  }
872 
873 
874  int null_sid() const { return null_sid_; }
875  int common_sid() const { return common_sid_; }
876  int latin_sid() const { return latin_sid_; }
877  int cyrillic_sid() const { return cyrillic_sid_; }
878  int greek_sid() const { return greek_sid_; }
879  int han_sid() const { return han_sid_; }
880  int hiragana_sid() const { return hiragana_sid_; }
881  int katakana_sid() const { return katakana_sid_; }
882  int thai_sid() const { return thai_sid_; }
883  int hangul_sid() const { return hangul_sid_; }
884  int default_sid() const { return default_sid_; }
885 
886  // Returns true if the unicharset has the concept of upper/lower case.
887  bool script_has_upper_lower() const {
888  return script_has_upper_lower_;
889  }
890  // Returns true if the unicharset has the concept of x-height.
891  // script_has_xheight can be true even if script_has_upper_lower is not,
892  // when the script has a sufficiently predominant top line with ascenders,
893  // such as Devanagari and Thai.
894  bool script_has_xheight() const {
895  return script_has_xheight_;
896  }
897 
898  private:
899 
900  struct UNICHAR_PROPERTIES {
901  UNICHAR_PROPERTIES();
902  // Initializes all properties to sensible default values.
903  void Init();
904  // Sets all ranges wide open. Initialization default in case there are
905  // no useful values available.
906  void SetRangesOpen();
907  // Sets all ranges to empty. Used before expanding with font-based data.
908  void SetRangesEmpty();
909  // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
910  // is empty.
911  bool AnyRangeEmpty() const;
912  // Expands the ranges with the ranges from the src properties.
913  void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
914  // Copies the properties from src into this.
915  void CopyFrom(const UNICHAR_PROPERTIES& src);
916 
917  bool isalpha;
918  bool islower;
919  bool isupper;
920  bool isdigit;
921  bool ispunctuation;
922  bool isngram;
923  bool enabled;
924  // Possible limits of the top and bottom of the bounding box in
925  // baseline-normalized coordinates, ie, where the baseline is
926  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
927  // (See normalis.h for the definitions).
928  uint8_t min_bottom;
929  uint8_t max_bottom;
930  uint8_t min_top;
931  uint8_t max_top;
932  // Statistics of the widths of bounding box, relative to the median advance.
933  float width;
934  float width_sd;
935  // Stats of the x-bearing and advance, also relative to the median advance.
936  float bearing;
937  float bearing_sd;
938  float advance;
939  float advance_sd;
940  int script_id;
941  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
942  Direction direction; // direction of this unichar
943  // Mirror property is useful for reverse DAWG lookup for words in
944  // right-to-left languages (e.g. "(word)" would be in
945  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
946  // However, what we want in our DAWG is
947  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
948  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
949  UNICHAR_ID mirror;
950  // A string of unichar_ids that represent the corresponding normed string.
951  // For awkward characters like em-dash, this gives hyphen.
952  // For ligatures, this gives the string of normal unichars.
954  STRING normed; // normalized version of this unichar
955  // Contains meta information about the fragment if a unichar represents
956  // a fragment of a character, otherwise should be set to nullptr.
957  // It is assumed that character fragments are added to the unicharset
958  // after the corresponding 'base' characters.
959  CHAR_FRAGMENT *fragment;
960  };
961 
962  struct UNICHAR_SLOT {
963  char representation[UNICHAR_LEN + 1];
964  UNICHAR_PROPERTIES properties;
965  };
966 
967  // Internal recursive version of encode_string above.
968  // str is the start of the whole string.
969  // str_index is the current position in str.
970  // str_length is the length of str.
971  // encoding is a working encoding of str.
972  // lengths is a working set of lengths of each element of encoding.
973  // best_total_length is the longest length of str that has been successfully
974  // encoded so far.
975  // On return:
976  // best_encoding contains the encoding that used the longest part of str.
977  // best_lengths (may be null) contains the lengths of best_encoding.
// There is no return value; all results are reported through the
// best_total_length, best_encoding and best_lengths output parameters.
978  void encode_string(const char* str, int str_index, int str_length,
979  GenericVector<UNICHAR_ID>* encoding,
980  GenericVector<char>* lengths,
981  int* best_total_length,
982  GenericVector<UNICHAR_ID>* best_encoding,
983  GenericVector<char>* best_lengths) const;
984 
985  // Gets the properties for a grapheme string, combining properties for
986  // multiple characters in a meaningful way where possible.
// utf8_str is the grapheme string to look up; props is the output record
// that receives the combined properties.
987  // Returns false if no valid match was found in the unicharset.
988  // NOTE that script_id, mirror, and other_case refer to this unicharset on
989  // return and will need redirecting if the target unicharset is different.
990  bool GetStrProperties(const char* utf8_str,
991  UNICHAR_PROPERTIES* props) const;
992 
993  // Load ourselves from a "file" where our only interface to the file is
994  // an implementation of fgets(). This is the parsing primitive accessed by
995  // the public routines load_from_file().
// fgets_cb is a callable with the signature char*(char*, int) that stands in
// for fgets().
// NOTE(review): skip_fragments presumably makes the loader skip character
// fragment entries, and the bool result presumably reports success --
// confirm against the implementation.
996  bool load_via_fgets(std::function<char*(char*, int)> fgets_cb,
997  bool skip_fragments);
998 
999  // List of mappings to make when ingesting strings from the outside.
1000  // The substitutions clean up text that should exist for rendering of
1001  // synthetic data, but not in the recognition set.
// Each entry is a pair of C strings.
1002  static const char* kCleanupMaps[][2];
// Script string used for the null script: get_script_from_script_id()
// returns it for out-of-range ids and is_null_script() compares against it
// by pointer.
1003  static TESS_API const char* null_script;
1004 
// Array of unichar slots, indexed by UNICHAR_ID.
1005  UNICHAR_SLOT* unichars;
// Lookup structure queried with unichar representations (see ids.contains()).
1006  UNICHARMAP ids;
// NOTE(review): presumably the number of slots in use and the number of
// slots allocated in unichars -- confirm against the implementation.
1007  int size_used;
1008  int size_reserved;
// Table of script strings, indexed by script id.
1009  char** script_table;
// Number of valid entries in script_table; ids outside
// [0, script_table_size_used) are treated as the null script.
1010  int script_table_size_used;
// NOTE(review): presumably the allocated capacity of script_table -- confirm.
1011  int script_table_size_reserved;
1012  // True if the unichars have their tops/bottoms set.
1013  bool top_bottom_set_;
1014  // True if the unicharset has significant upper/lower case chars.
1015  bool script_has_upper_lower_;
1016  // True if the unicharset has a significant mean-line with significant
1017  // ascenders above that.
1018  bool script_has_xheight_;
1019  // True if the set contains chars that would be changed by the cleanup.
1020  bool old_style_included_;
1021 
1022  // A few convenient script name-to-id mappings without using a hash.
1023  // These are initialized when unicharset file is loaded. Anything
1024  // missing from this list can be looked up using get_script_id_from_name.
// Each value is exposed through the accessor of the same name without the
// trailing underscore (null_sid(), common_sid(), ...).
1025  int null_sid_;
1026  int common_sid_;
1027  int latin_sid_;
1028  int cyrillic_sid_;
1029  int greek_sid_;
1030  int han_sid_;
1031  int hiragana_sid_;
1032  int katakana_sid_;
1033  int thai_sid_;
1034  int hangul_sid_;
1035  // The most frequently occurring script in the charset.
1036  int default_sid_;
1037 };
1038 
1039 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
string
std::string string
Definition: equationdetect_test.cc:21
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
strngs.h
UNICHARSET::set_isupper
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
UNICHARSET::U_EUROPEAN_NUMBER
Definition: unicharset.h:159
unicharmap.h
UNICHARSET::UNICHARSET
UNICHARSET()
Definition: unicharset.cpp:175
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
UNICHARSET::AppendOtherUnicharset
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:463
CHAR_FRAGMENT::get_pos
int get_pos() const
Definition: unicharset.h:71
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
CHAR_FRAGMENT::kMaxChunks
static const int kMaxChunks
Definition: unicharset.h:55
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
UNICHARSET::set_islower
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:426
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298
CHAR_FRAGMENT::is_continuation_of
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:264
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
UNICHARSET::get_script_id_from_name
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:1099
UNICHARSET::encodable_string
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:243
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
UNICHARSET::~UNICHARSET
~UNICHARSET()
Definition: unicharset.cpp:190
UNICHARSET::debug_str
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:254
UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:731
UNICHARSET::PropertiesIncomplete
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:636
UNICHARMAP::contains
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
OldUncleanUnichars::kTrue
UNICHARSET::set_normed
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:472
CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:70
UNICHARSET::set_direction
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:462
UNICHARSET::U_FIRST_STRONG_ISOLATE
Definition: unicharset.h:176
UNICHARMAP::clear
void clear()
Definition: unicharmap.cpp:115
UNICHARSET::get_advance_stats
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:620
CHAR_FRAGMENT::equals
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:85
STRING
Definition: strngs.h:45
UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:844
UNICHARSET::hangul_sid
int hangul_sid() const
Definition: unicharset.h:883
UNICHARSET::U_WHITE_SPACE_NEUTRAL
Definition: unicharset.h:166
UNICHARSET::is_null_script
bool is_null_script(const char *script) const
Definition: unicharset.h:858
UNICHARSET::get_isngram
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:516
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:774
UNICHARSET::U_OTHER_NEUTRAL
Definition: unicharset.h:167
UNICHARSET::IsSpaceDelimited
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:642
UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:157
UNICHARSET::set_normed_ids
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:372
UNICHARSET::U_RIGHT_TO_LEFT_EMBEDDING
Definition: unicharset.h:171
UNICHARSET::set_top_bottom
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:572
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
UNICHARSET::get_width_stats
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:586
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
UNICHARSET::get_normed_unichar
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:818
UNICHARSET::set_bearing_stats
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:613
UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:805
CHAR_FRAGMENT::is_natural
bool is_natural() const
Definition: unicharset.h:113
UNICHARSET::clear
void clear()
Definition: unicharset.h:306
UNICHARMAP
Definition: unicharmap.h:27
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
UNICHARSET::set_isalpha
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:421
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
UNICHARSET::kCustomLigatures
static const TESS_API char * kCustomLigatures[][2]
Definition: unicharset.h:150
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
UNICHARSET::SetPropertiesFromOther
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:535
genericvector.h
UNICHARSET::set_width_stats
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:597
OldUncleanUnichars
OldUncleanUnichars
Definition: unicharset.h:43
UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:791
CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1057
UNICHARSET::major_right_to_left
bool major_right_to_left() const
Definition: unicharset.cpp:952
UNICHARSET::add_script
int add_script(const char *script)
Definition: unicharset.cpp:1020
UNICHAR_BROKEN
Definition: unicharset.h:36
UNICHARSET::get_properties
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:601
UNICHARSET::U_DIR_NON_SPACING_MARK
Definition: unicharset.h:174
CHAR_FRAGMENT::set_all
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:58
UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:798
UNICHARSET::to_lower
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
UNICHARSET::set_advance_stats
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:630
OldUncleanUnichars::kFalse
UNICHARSET::set_black_and_whitelist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:969
UNICHARSET::AnyRepeatedUnicodes
bool AnyRepeatedUnicodes() const
Definition: unicharset.cpp:1007
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868
file
Definition: include_gunit.h:22
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
UNICHARSET::U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
UNICHARSET::reserve
void reserve(int unichars_number)
Definition: unicharset.cpp:194
UNICHARSET::load_from_file
bool load_from_file(FILE *file)
Definition: unicharset.h:393
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
UNICHARSET::U_POP_DIRECTIONAL_FORMAT
Definition: unicharset.h:173
UNICHARSET::set_ranges_empty
void set_ranges_empty()
Definition: unicharset.cpp:395
UNICHARSET::get_isprivate
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:387
UNICHARSET::thai_sid
int thai_sid() const
Definition: unicharset.h:882
UNICHARSET::U_LEFT_TO_RIGHT_OVERRIDE
Definition: unicharset.h:169
UNICHARSET::U_BOUNDARY_NEUTRAL
Definition: unicharset.h:175
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHARSET::save_to_file
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:366
UNICHAR_SPACE
Definition: unicharset.h:34
UNICHARSET::U_RIGHT_TO_LEFT_ISOLATE
Definition: unicharset.h:178
tesseract::TFile
Definition: serialis.h:75
UNICHARSET::U_RIGHT_TO_LEFT_OVERRIDE
Definition: unicharset.h:172
UNICHARSET::save_to_string
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:691
UNICHARSET::get_properties
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:757
UNICHARSET::U_ARABIC_NUMBER
Definition: unicharset.h:162
CHAR_FRAGMENT::set_natural
void set_natural(bool value)
Definition: unicharset.h:114
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:527
CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:108
UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:894
UNICHARSET::get_chartype
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:616
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:880
UNICHARSET::set_mirror
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:467
UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:876
UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:751
UNICHARSET::script_has_upper_lower
bool script_has_upper_lower() const
Definition: unicharset.h:887
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
UNICHARSET::to_upper
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:702
UNICHARSET::set_script
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:452
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
CHAR_FRAGMENT::kMaxLen
static const int kMaxLen
Definition: unicharset.h:53
helpers.h
SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
UNICHARSET::set_ispunctuation
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:72
UNICHARSET::get_script
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:768
CHAR_FRAGMENT::to_string
STRING to_string() const
Definition: unicharset.h:79
UNICHARSET::unichar_insert_backwards_compatible
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
UNICHARSET::save_to_file
bool save_to_file(FILE *file) const
Definition: unicharset.h:360
UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:712
UNICHARSET::post_load_setup
void post_load_setup()
Definition: unicharset.cpp:886
UNICHARSET::kSpecialUnicharCodes
static const TESS_API char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
CHAR_FRAGMENT::set_unichar
void set_unichar(const char *uch)
Definition: unicharset.h:64
GenericVector< UNICHAR_ID >
UNICHARSET::common_sid
int common_sid() const
Definition: unicharset.h:875
UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:839
UNICHARSET::get_isdigit
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:746
UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
CHAR_FRAGMENT
Definition: unicharset.h:48
UNICHARSET::U_POP_DIRECTIONAL_ISOLATE
Definition: unicharset.h:179
UNICHARSET::U_BLOCK_SEPARATOR
Definition: unicharset.h:164
CHAR_FRAGMENT::equals
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:90
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
STRING::length
int32_t length() const
Definition: strngs.cpp:187
UNICHARSET::PartialSetPropertiesFromOther
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:404
UNICHARSET::get_isupper
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:741
UNICHARSET::get_ispunctuation
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:812
UNICHARSET::set_isngram
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
UNICHARSET::get_isalpha
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:784
unichar.h
UNICHARSET::Direction
Direction
Definition: unicharset.h:156
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
UNICHARSET::get_bearing_stats
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:603
UNICHARSET::get_chartype
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:761
TESS_API
#define TESS_API
Definition: platform.h:54
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
UNICHARSET::U_CHAR_DIRECTION_COUNT
Definition: unicharset.h:181
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
errcode.h
UNICHARSET::SizesDistinct
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:485
serialis.h
CHAR_FRAGMENT::set_total
void set_total(int t)
Definition: unicharset.h:69
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:673
UNICHARSET::load_from_file
bool load_from_file(const char *const filename)
Definition: unicharset.h:386
CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:105
UNICHARSET::U_SEGMENT_SEPARATOR
Definition: unicharset.h:165
UNICHARSET::U_LEFT_TO_RIGHT_ISOLATE
Definition: unicharset.h:177
UNICHARSET::debug_utf8_str
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:318
UNICHARSET::set_isdigit
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
CHAR_FRAGMENT::set_pos
void set_pos(int p)
Definition: unicharset.h:68
UNICHARSET::get_script
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:833
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
UNICHARSET::cyrillic_sid
int cyrillic_sid() const
Definition: unicharset.h:877
CHAR_FRAGMENT::kMinLen
static const int kMinLen
Definition: unicharset.h:51
UNICHARSET::set_other_case
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:457
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341
UNICHARSET::ExpandRangesFromOther
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:434
UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:884
UNICHARSET::CopyFrom
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:447
UNICHARSET::get_islower
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:736
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
UNICHAR_JOINED
Definition: unicharset.h:35
UNICHARSET::greek_sid
int greek_sid() const
Definition: unicharset.h:878
UNICHARSET::U_LEFT_TO_RIGHT_EMBEDDING
Definition: unicharset.h:168
SpecialUnicharCodes
SpecialUnicharCodes
Definition: unicharset.h:33
UNICHARSET::delete_pointers_in_unichars
void delete_pointers_in_unichars()
Definition: unicharset.h:298