tesseract  5.0.0-alpha-619-ge9db
tesseract::UNICHAR Class Reference

#include <unichar.h>

Classes

class  const_iterator
 

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 
static const_iterator begin (const char *utf8_str, int byte_length)
 
static const_iterator end (const char *utf8_str, int byte_length)
 
static std::vector< char32 > UTF8ToUTF32 (const char *utf8_str)
 
static std::string UTF32ToUTF8 (const std::vector< char32 > &str32)
 

Detailed Description

Definition at line 59 of file unichar.h.

Constructor & Destructor Documentation

◆ UNICHAR() [1/3]

tesseract::UNICHAR::UNICHAR ( )
inline

Definition at line 61 of file unichar.h.

61  {
62  memset(chars, 0, UNICHAR_LEN);
63  }

◆ UNICHAR() [2/3]

tesseract::UNICHAR::UNICHAR ( const char *  utf8_str,
int  len 
)

Definition at line 32 of file unichar.cpp.

32  {
33  int total_len = 0;
34  int step = 0;
35  if (len < 0) {
36  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
37  }
38  for (total_len = 0; total_len < len; total_len += step) {
39  step = utf8_step(utf8_str + total_len);
40  if (total_len + step > UNICHAR_LEN)
41  break; // Too long.
42  if (step == 0)
43  break; // Illegal first byte.
44  int i;
45  for (i = 1; i < step; ++i)
46  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
47  break;
48  if (i < step)
49  break; // Illegal surrogate
50  }
51  memcpy(chars, utf8_str, total_len);
52  if (total_len < UNICHAR_LEN) {
53  chars[UNICHAR_LEN - 1] = total_len;
54  while (total_len < UNICHAR_LEN - 1)
55  chars[total_len++] = 0;
56  }
57 }

◆ UNICHAR() [3/3]

tesseract::UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 61 of file unichar.cpp.

61  {
62  const int bytemask = 0xBF;
63  const int bytemark = 0x80;
64 
65  if (unicode < 0x80) {
66  chars[UNICHAR_LEN - 1] = 1;
67  chars[2] = 0;
68  chars[1] = 0;
69  chars[0] = static_cast<char>(unicode);
70  } else if (unicode < 0x800) {
71  chars[UNICHAR_LEN - 1] = 2;
72  chars[2] = 0;
73  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
74  unicode >>= 6;
75  chars[0] = static_cast<char>(unicode | 0xc0);
76  } else if (unicode < 0x10000) {
77  chars[UNICHAR_LEN - 1] = 3;
78  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
79  unicode >>= 6;
80  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81  unicode >>= 6;
82  chars[0] = static_cast<char>(unicode | 0xe0);
83  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
84  chars[UNICHAR_LEN - 1] = 4;
85  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
86  unicode >>= 6;
87  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
88  unicode >>= 6;
89  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
90  unicode >>= 6;
91  chars[0] = static_cast<char>(unicode | 0xf0);
92  } else {
93  memset(chars, 0, UNICHAR_LEN);
94  }
95 }

Member Function Documentation

◆ begin()

UNICHAR::const_iterator tesseract::UNICHAR::begin ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 204 of file unichar.cpp.

204  {
205  return UNICHAR::const_iterator(utf8_str);
206 }

◆ end()

UNICHAR::const_iterator tesseract::UNICHAR::end ( const char *  utf8_str,
int  byte_length 
)
static

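Definition at line 208 of file unichar.cpp.

Note: in the listing below, len appears to be the definition's name for the parameter shown as byte_length in the signature above.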

208  {
209  return UNICHAR::const_iterator(utf8_str + len);
210 }

◆ first_uni()

int tesseract::UNICHAR::first_uni ( ) const

Definition at line 98 of file unichar.cpp.

98  {
99  static const int utf8_offsets[5] = {
100  0, 0, 0x3080, 0xE2080, 0x3C82080
101  };
102  int uni = 0;
103  int len = utf8_step(chars);
104  const char* src = chars;
105 
106  switch (len) {
107  default:
108  break;
109  case 4:
110  uni += static_cast<unsigned char>(*src++);
111  uni <<= 6;
112  // Fall through.
113  case 3:
114  uni += static_cast<unsigned char>(*src++);
115  uni <<= 6;
116  // Fall through.
117  case 2:
118  uni += static_cast<unsigned char>(*src++);
119  uni <<= 6;
120  // Fall through.
121  case 1:
122  uni += static_cast<unsigned char>(*src++);
123  }
124  uni -= utf8_offsets[len];
125  return uni;
126 }

◆ UTF32ToUTF8()

std::string tesseract::UNICHAR::UTF32ToUTF8 ( const std::vector< char32 > &  str32)
static

Definition at line 232 of file unichar.cpp.

232  {
233  std::string utf8_str;
234  for (char32 ch : str32) {
235  UNICHAR uni_ch(ch);
236  int step;
237  if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
238  utf8_str.append(uni_ch.utf8(), step);
239  } else {
240  return "";
241  }
242  }
243  return utf8_str;
244 }

◆ utf8()

const char* tesseract::UNICHAR::utf8 ( ) const
inline

Definition at line 85 of file unichar.h.

85  {
86  return chars;
87  }

◆ utf8_len()

int tesseract::UNICHAR::utf8_len ( ) const
inline

Definition at line 79 of file unichar.h.

79  {
80  int len = chars[UNICHAR_LEN - 1];
81  return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
82  }

◆ utf8_step()

int tesseract::UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 138 of file unichar.cpp.

138  {
139  static const char utf8_bytes[256] = {
140  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
147  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
148  };
149 
150  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
151 }

◆ utf8_str()

char * tesseract::UNICHAR::utf8_str ( ) const

Definition at line 129 of file unichar.cpp.

129  {
130  int len = utf8_len();
131  char* str = new char[len + 1];
132  memcpy(str, chars, len);
133  str[len] = 0;
134  return str;
135 }

◆ UTF8ToUTF32()

std::vector< char32 > tesseract::UNICHAR::UTF8ToUTF32 ( const char *  utf8_str)
static

Definition at line 215 of file unichar.cpp.

215  {
216  const int utf8_length = strlen(utf8_str);
217  std::vector<char32> unicodes;
218  unicodes.reserve(utf8_length);
219  const_iterator end_it(end(utf8_str, utf8_length));
220  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
221  if (it.is_legal()) {
222  unicodes.push_back(*it);
223  } else {
224  unicodes.clear();
225  return unicodes;
226  }
227  }
228  return unicodes;
229 }

The documentation for this class was generated from the following files:
unichar.h
unichar.cpp
tesseract::UNICHAR::utf8_len
int utf8_len() const
Definition: unichar.h:79
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::UNICHAR::begin
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
tesseract::UNICHAR::end
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
tesseract::UNICHAR::UNICHAR
UNICHAR()
Definition: unichar.h:61
tesseract::UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
tesseract::UNICHAR::utf8_str
char * utf8_str() const
Definition: unichar.cpp:129
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
char32
signed int char32
Definition: pango_font_info.h:33
UNI_MAX_LEGAL_UTF32
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:24