tesseract  5.0.0-alpha-619-ge9db
utf.h File Reference
#include <stdint.h>

Go to the source code of this file.

Typedefs

typedef signed int Rune
 

Enumerations

enum  {
  UTFmax = 4, Runesync = 0x80, Runeself = 0x80, Runeerror = 0xFFFD,
  Runemax = 0x10FFFF
}
 

Functions

int runetochar (char *s, const Rune *r)
 
int chartorune (Rune *r, const char *s)
 
int charntorune (Rune *r, const char *s, int n)
 
int isvalidcharntorune (const char *str, int n, Rune *r, int *consumed)
 
int runelen (Rune r)
 
int runenlen (const Rune *r, int n)
 
int fullrune (const char *s, int n)
 
int utflen (const char *s)
 
int utfnlen (const char *s, long n)
 
const char * utfrune (const char *s, Rune r)
 
const char * utfrrune (const char *s, Rune r)
 
const char * utfutf (const char *s1, const char *s2)
 
char * utfecpy (char *s1, char *es1, const char *s2)
 
Rune * runestrcat (Rune *s1, const Rune *s2)
 
Rune * runestrncat (Rune *s1, const Rune *s2, long n)
 
const Rune * runestrchr (const Rune *s, Rune c)
 
int runestrcmp (const Rune *s1, const Rune *s2)
 
int runestrncmp (const Rune *s1, const Rune *s2, long n)
 
Rune * runestrcpy (Rune *s1, const Rune *s2)
 
Rune * runestrncpy (Rune *s1, const Rune *s2, long n)
 
Rune * runestrecpy (Rune *s1, Rune *es1, const Rune *s2)
 
Rune * runestrdup (const Rune *s)
 
const Rune * runestrrchr (const Rune *s, Rune c)
 
long runestrlen (const Rune *s)
 
const Rune * runestrstr (const Rune *s1, const Rune *s2)
 
Rune toupperrune (Rune r)
 
Rune tolowerrune (Rune r)
 
Rune totitlerune (Rune r)
 
int isupperrune (Rune r)
 
int islowerrune (Rune r)
 
int istitlerune (Rune r)
 
int isalpharune (Rune r)
 
int isdigitrune (Rune r)
 
int isideographicrune (Rune r)
 
int isspacerune (Rune r)
 

Typedef Documentation

◆ Rune

typedef signed int Rune

Definition at line 19 of file utf.h.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum
Enumerator
UTFmax 
Runesync 
Runeself 
Runeerror 
Runemax 

Definition at line 21 of file utf.h.

22 {
23  UTFmax = 4, /* maximum bytes per rune */
24  Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
25  Runeself = 0x80, /* rune and UTF sequences are the same (<) */
26  Runeerror = 0xFFFD, /* decoding error in UTF */
27  Runemax = 0x10FFFF, /* maximum rune value */
28 };

Function Documentation

◆ charntorune()

int charntorune ( Rune *  r,
const char *  s,
int  n 
)

Definition at line 66 of file rune.c.

67 {
68  int c, c1, c2, c3;
69  long l;
70 
71  /* When we're not allowed to read anything */
72  if(length <= 0) {
73  goto badlen;
74  }
75 
76  /*
77  * one character sequence (7-bit value)
78  * 00000-0007F => T1
79  */
80  c = *(uchar*)str;
81  if(c < Tx) {
82  *rune = c;
83  return 1;
84  }
85 
86  // If we can't read more than one character we must stop
87  if(length <= 1) {
88  goto badlen;
89  }
90 
91  /*
92  * two character sequence (11-bit value)
93  * 0080-07FF => T2 Tx
94  */
95  c1 = *(uchar*)(str+1) ^ Tx;
96  if(c1 & Testx)
97  goto bad;
98  if(c < T3) {
99  if(c < T2)
100  goto bad;
101  l = ((c << Bitx) | c1) & Rune2;
102  if(l <= Rune1)
103  goto bad;
104  *rune = l;
105  return 2;
106  }
107 
108  // If we can't read more than two characters we must stop
109  if(length <= 2) {
110  goto badlen;
111  }
112 
113  /*
114  * three character sequence (16-bit value)
115  * 0800-FFFF => T3 Tx Tx
116  */
117  c2 = *(uchar*)(str+2) ^ Tx;
118  if(c2 & Testx)
119  goto bad;
120  if(c < T4) {
121  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
122  if(l <= Rune2)
123  goto bad;
124  *rune = l;
125  return 3;
126  }
127 
128  if (length <= 3)
129  goto badlen;
130 
131  /*
132  * four character sequence (21-bit value)
133  * 10000-1FFFFF => T4 Tx Tx Tx
134  */
135  c3 = *(uchar*)(str+3) ^ Tx;
136  if (c3 & Testx)
137  goto bad;
138  if (c < T5) {
139  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
140  if (l <= Rune3)
141  goto bad;
142  if (l > Runemax)
143  goto bad;
144  *rune = l;
145  return 4;
146  }
147 
148  // Support for 5-byte or longer UTF-8 would go here, but
149  // since we don't have that, we'll just fall through to bad.
150 
151  /*
152  * bad decoding
153  */
154 bad:
155  *rune = Bad;
156  return 1;
157 badlen:
158  *rune = Bad;
159  return 0;
160 
161 }

◆ chartorune()

int chartorune ( Rune *  r,
const char *  s 
)

Definition at line 169 of file rune.c.

170 {
171  int c, c1, c2, c3;
172  long l;
173 
174  /*
175  * one character sequence
176  * 00000-0007F => T1
177  */
178  c = *(uchar*)str;
179  if(c < Tx) {
180  *rune = c;
181  return 1;
182  }
183 
184  /*
185  * two character sequence
186  * 0080-07FF => T2 Tx
187  */
188  c1 = *(uchar*)(str+1) ^ Tx;
189  if(c1 & Testx)
190  goto bad;
191  if(c < T3) {
192  if(c < T2)
193  goto bad;
194  l = ((c << Bitx) | c1) & Rune2;
195  if(l <= Rune1)
196  goto bad;
197  *rune = l;
198  return 2;
199  }
200 
201  /*
202  * three character sequence
203  * 0800-FFFF => T3 Tx Tx
204  */
205  c2 = *(uchar*)(str+2) ^ Tx;
206  if(c2 & Testx)
207  goto bad;
208  if(c < T4) {
209  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
210  if(l <= Rune2)
211  goto bad;
212  *rune = l;
213  return 3;
214  }
215 
216  /*
217  * four character sequence (21-bit value)
218  * 10000-1FFFFF => T4 Tx Tx Tx
219  */
220  c3 = *(uchar*)(str+3) ^ Tx;
221  if (c3 & Testx)
222  goto bad;
223  if (c < T5) {
224  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
225  if (l <= Rune3)
226  goto bad;
227  if (l > Runemax)
228  goto bad;
229  *rune = l;
230  return 4;
231  }
232 
233  /*
234  * Support for 5-byte or longer UTF-8 would go here, but
235  * since we don't have that, we'll just fall through to bad.
236  */
237 
238  /*
239  * bad decoding
240  */
241 bad:
242  *rune = Bad;
243  return 1;
244 }

◆ fullrune()

int fullrune ( const char *  s,
int  n 
)

Definition at line 341 of file rune.c.

342 {
343  if (n > 0) {
344  int c = *(uchar*)str;
345  if (c < Tx)
346  return 1;
347  if (n > 1) {
348  if (c < T3)
349  return 1;
350  if (n > 2) {
351  if (c < T4 || n > 3)
352  return 1;
353  }
354  }
355  }
356  return 0;
357 }

◆ isalpharune()

int isalpharune ( Rune  r)

◆ isdigitrune()

int isdigitrune ( Rune  r)

◆ isideographicrune()

int isideographicrune ( Rune  r)

◆ islowerrune()

int islowerrune ( Rune  r)

◆ isspacerune()

int isspacerune ( Rune  r)

◆ istitlerune()

int istitlerune ( Rune  r)

◆ isupperrune()

int isupperrune ( Rune  r)

◆ isvalidcharntorune()

int isvalidcharntorune ( const char *  str,
int  n,
Rune *  r,
int *  consumed 
)

Definition at line 247 of file rune.c.

247  {
248  *consumed = charntorune(rune, str, length);
249  return *rune != Runeerror || *consumed == 3;
250 }

◆ runelen()

int runelen ( Rune  r)

Definition at line 310 of file rune.c.

311 {
312  char str[10];
313 
314  return runetochar(str, &rune);
315 }

◆ runenlen()

int runenlen ( const Rune *  r,
int  n 
)

Definition at line 318 of file rune.c.

319 {
320  int nb;
321  ulong c; /* Rune is signed, so use unsigned for range check. */
322 
323  nb = 0;
324  while(nrune--) {
325  c = *r++;
326  if (c <= Rune1)
327  nb++;
328  else if (c <= Rune2)
329  nb += 2;
330  else if (c <= Rune3)
331  nb += 3;
332  else if (c <= Runemax)
333  nb += 4;
334  else
335  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
336  }
337  return nb;
338 }

◆ runestrcat()

Rune* runestrcat ( Rune *  s1,
const Rune *  s2 
)

◆ runestrchr()

const Rune* runestrchr ( const Rune *  s,
Rune  c 
)

◆ runestrcmp()

int runestrcmp ( const Rune *  s1,
const Rune *  s2 
)

◆ runestrcpy()

Rune* runestrcpy ( Rune *  s1,
const Rune *  s2 
)

◆ runestrdup()

Rune* runestrdup ( const Rune *  s)

◆ runestrecpy()

Rune* runestrecpy ( Rune *  s1,
Rune *  es1,
const Rune *  s2 
)

◆ runestrlen()

long runestrlen ( const Rune *  s)

◆ runestrncat()

Rune* runestrncat ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncmp()

int runestrncmp ( const Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncpy()

Rune* runestrncpy ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrrchr()

const Rune* runestrrchr ( const Rune *  s,
Rune  c 
)

◆ runestrstr()

const Rune* runestrstr ( const Rune *  s1,
const Rune *  s2 
)

◆ runetochar()

int runetochar ( char *  s,
const Rune *  r 
)

Definition at line 253 of file rune.c.

254 {
255  /* Runes are signed, so convert to unsigned for range check. */
256  unsigned long c;
257 
258  /*
259  * one character sequence
260  * 00000-0007F => 00-7F
261  */
262  c = *rune;
263  if(c <= Rune1) {
264  str[0] = c;
265  return 1;
266  }
267 
268  /*
269  * two character sequence
270  * 0080-07FF => T2 Tx
271  */
272  if(c <= Rune2) {
273  str[0] = T2 | (c >> 1*Bitx);
274  str[1] = Tx | (c & Maskx);
275  return 2;
276  }
277 
278  /*
279  * If the Rune is out of range, convert it to the error rune.
280  * Do this test here because the error rune encodes to three bytes.
281  * Doing it earlier would duplicate work, since an out of range
282  * Rune wouldn't have fit in one or two bytes.
283  */
284  if (c > Runemax)
285  c = Runeerror;
286 
287  /*
288  * three character sequence
289  * 0800-FFFF => T3 Tx Tx
290  */
291  if (c <= Rune3) {
292  str[0] = T3 | (c >> 2*Bitx);
293  str[1] = Tx | ((c >> 1*Bitx) & Maskx);
294  str[2] = Tx | (c & Maskx);
295  return 3;
296  }
297 
298  /*
299  * four character sequence (21-bit value)
300  * 10000-1FFFFF => T4 Tx Tx Tx
301  */
302  str[0] = T4 | (c >> 3*Bitx);
303  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
304  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
305  str[3] = Tx | (c & Maskx);
306  return 4;
307 }

◆ tolowerrune()

Rune tolowerrune ( Rune  r)

◆ totitlerune()

Rune totitlerune ( Rune  r)

◆ toupperrune()

Rune toupperrune ( Rune  r)

◆ utfecpy()

char* utfecpy ( char *  s1,
char *  es1,
const char *  s2 
)

◆ utflen()

int utflen ( const char *  s)

◆ utfnlen()

int utfnlen ( const char *  s,
long  n 
)

◆ utfrrune()

const char* utfrrune ( const char *  s,
Rune  r 
)

◆ utfrune()

const char* utfrune ( const char *  s,
Rune  r 
)

◆ utfutf()

const char* utfutf ( const char *  s1,
const char *  s2 
)
Runeself
Definition: utf.h:25
Testx
Definition: rune.c:42
charntorune
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:66
Runeerror
Definition: utf.h:26
Maskx
Definition: rune.c:41
Bad
Definition: rune.c:44
Rune2
Definition: rune.c:36
Runemax
Definition: utf.h:27
uchar
unsigned char uchar
Definition: utfdef.h:8
Rune3
Definition: rune.c:37
UTFmax
Definition: utf.h:23
T5
Definition: rune.c:33
Tx
Definition: rune.c:29
Runesync
Definition: utf.h:24
ulong
unsigned long ulong
Definition: utfdef.h:11
T2
Definition: rune.c:30
Bitx
Definition: rune.c:22
T4
Definition: rune.c:32
T3
Definition: rune.c:31
Rune1
Definition: rune.c:35
Rune4
Definition: rune.c:38
runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:253