tesseract  5.0.0-alpha-619-ge9db
rune.c File Reference
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"

Go to the source code of this file.

Enumerations

enum  {
  Bit1 = 7, Bitx = 6, Bit2 = 5, Bit3 = 4,
  Bit4 = 3, Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, Tx = ((1<<(Bitx+1))-1) ^ 0xFF,
  T2 = ((1<<(Bit2+1))-1) ^ 0xFF, T3 = ((1<<(Bit3+1))-1) ^ 0xFF, T4 = ((1<<(Bit4+1))-1) ^ 0xFF, T5 = ((1<<(Bit5+1))-1) ^ 0xFF,
  Rune1 = (1<<(Bit1+0*Bitx))-1, Rune2 = (1<<(Bit2+1*Bitx))-1, Rune3 = (1<<(Bit3+2*Bitx))-1, Rune4 = (1<<(Bit4+3*Bitx))-1,
  Maskx = (1<<Bitx)-1, Testx = Maskx ^ 0xFF, Bad = Runeerror
}
 

Functions

int charntorune (Rune *rune, const char *str, int length)
 
int chartorune (Rune *rune, const char *str)
 
int isvalidcharntorune (const char *str, int length, Rune *rune, int *consumed)
 
int runetochar (char *str, const Rune *rune)
 
int runelen (Rune rune)
 
int runenlen (const Rune *r, int nrune)
 
int fullrune (const char *str, int n)
 

Enumeration Type Documentation

◆ anonymous enum

anonymous enum
Enumerator
Bit1 
Bitx 
Bit2 
Bit3 
Bit4 
Bit5 
T1 
Tx 
T2 
T3 
T4 
T5 
Rune1 
Rune2 
Rune3 
Rune4 
Maskx 
Testx 
Bad 

Definition at line 19 of file rune.c.

20 {
21  Bit1 = 7,
22  Bitx = 6,
23  Bit2 = 5,
24  Bit3 = 4,
25  Bit4 = 3,
26  Bit5 = 2,
27 
28  T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
29  Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
30  T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
31  T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
32  T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
33  T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
34 
35  Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
36  Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
37  Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
38  Rune4 = (1<<(Bit4+3*Bitx))-1,
39  /* 0001 1111 1111 1111 1111 1111 */
40 
41  Maskx = (1<<Bitx)-1, /* 0011 1111 */
42  Testx = Maskx ^ 0xFF, /* 1100 0000 */
43 
44  Bad = Runeerror,
45 };

Function Documentation

◆ charntorune()

int charntorune ( Rune *  rune,
const char *  str,
int  length 
)

Definition at line 66 of file rune.c.

67 {
68  int c, c1, c2, c3;
69  long l;
70 
71  /* When we're not allowed to read anything */
72  if(length <= 0) {
73  goto badlen;
74  }
75 
76  /*
77  * one character sequence (7-bit value)
78  * 00000-0007F => T1
79  */
80  c = *(uchar*)str;
81  if(c < Tx) {
82  *rune = c;
83  return 1;
84  }
85 
86  // If we can't read more than one character we must stop
87  if(length <= 1) {
88  goto badlen;
89  }
90 
91  /*
92  * two character sequence (11-bit value)
93  * 0080-07FF => T2 Tx
94  */
95  c1 = *(uchar*)(str+1) ^ Tx;
96  if(c1 & Testx)
97  goto bad;
98  if(c < T3) {
99  if(c < T2)
100  goto bad;
101  l = ((c << Bitx) | c1) & Rune2;
102  if(l <= Rune1)
103  goto bad;
104  *rune = l;
105  return 2;
106  }
107 
108  // If we can't read more than two characters we must stop
109  if(length <= 2) {
110  goto badlen;
111  }
112 
113  /*
114  * three character sequence (16-bit value)
115  * 0800-FFFF => T3 Tx Tx
116  */
117  c2 = *(uchar*)(str+2) ^ Tx;
118  if(c2 & Testx)
119  goto bad;
120  if(c < T4) {
121  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
122  if(l <= Rune2)
123  goto bad;
124  *rune = l;
125  return 3;
126  }
127 
128  if (length <= 3)
129  goto badlen;
130 
131  /*
132  * four character sequence (21-bit value)
133  * 10000-1FFFFF => T4 Tx Tx Tx
134  */
135  c3 = *(uchar*)(str+3) ^ Tx;
136  if (c3 & Testx)
137  goto bad;
138  if (c < T5) {
139  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
140  if (l <= Rune3)
141  goto bad;
142  if (l > Runemax)
143  goto bad;
144  *rune = l;
145  return 4;
146  }
147 
148  // Support for 5-byte or longer UTF-8 would go here, but
149  // since we don't have that, we'll just fall through to bad.
150 
151  /*
152  * bad decoding
153  */
154 bad:
155  *rune = Bad;
156  return 1;
157 badlen:
158  *rune = Bad;
159  return 0;
160 
161 }

◆ chartorune()

int chartorune ( Rune *  rune,
const char *  str 
)

Definition at line 169 of file rune.c.

170 {
171  int c, c1, c2, c3;
172  long l;
173 
174  /*
175  * one character sequence
176  * 00000-0007F => T1
177  */
178  c = *(uchar*)str;
179  if(c < Tx) {
180  *rune = c;
181  return 1;
182  }
183 
184  /*
185  * two character sequence
186  * 0080-07FF => T2 Tx
187  */
188  c1 = *(uchar*)(str+1) ^ Tx;
189  if(c1 & Testx)
190  goto bad;
191  if(c < T3) {
192  if(c < T2)
193  goto bad;
194  l = ((c << Bitx) | c1) & Rune2;
195  if(l <= Rune1)
196  goto bad;
197  *rune = l;
198  return 2;
199  }
200 
201  /*
202  * three character sequence
203  * 0800-FFFF => T3 Tx Tx
204  */
205  c2 = *(uchar*)(str+2) ^ Tx;
206  if(c2 & Testx)
207  goto bad;
208  if(c < T4) {
209  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
210  if(l <= Rune2)
211  goto bad;
212  *rune = l;
213  return 3;
214  }
215 
216  /*
217  * four character sequence (21-bit value)
218  * 10000-1FFFFF => T4 Tx Tx Tx
219  */
220  c3 = *(uchar*)(str+3) ^ Tx;
221  if (c3 & Testx)
222  goto bad;
223  if (c < T5) {
224  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
225  if (l <= Rune3)
226  goto bad;
227  if (l > Runemax)
228  goto bad;
229  *rune = l;
230  return 4;
231  }
232 
233  /*
234  * Support for 5-byte or longer UTF-8 would go here, but
235  * since we don't have that, we'll just fall through to bad.
236  */
237 
238  /*
239  * bad decoding
240  */
241 bad:
242  *rune = Bad;
243  return 1;
244 }

◆ fullrune()

int fullrune ( const char *  str,
int  n 
)

Definition at line 341 of file rune.c.

342 {
343  if (n > 0) {
344  int c = *(uchar*)str;
345  if (c < Tx)
346  return 1;
347  if (n > 1) {
348  if (c < T3)
349  return 1;
350  if (n > 2) {
351  if (c < T4 || n > 3)
352  return 1;
353  }
354  }
355  }
356  return 0;
357 }

◆ isvalidcharntorune()

int isvalidcharntorune ( const char *  str,
int  length,
Rune *  rune,
int *  consumed 
)

Definition at line 247 of file rune.c.

247  {
248  *consumed = charntorune(rune, str, length);
249  return *rune != Runeerror || *consumed == 3;
250 }

◆ runelen()

int runelen ( Rune  rune)

Definition at line 310 of file rune.c.

311 {
312  char str[10];
313 
314  return runetochar(str, &rune);
315 }

◆ runenlen()

int runenlen ( const Rune *  r,
int  nrune 
)

Definition at line 318 of file rune.c.

319 {
320  int nb;
321  ulong c; /* Rune is signed, so use unsigned for range check. */
322 
323  nb = 0;
324  while(nrune--) {
325  c = *r++;
326  if (c <= Rune1)
327  nb++;
328  else if (c <= Rune2)
329  nb += 2;
330  else if (c <= Rune3)
331  nb += 3;
332  else if (c <= Runemax)
333  nb += 4;
334  else
335  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
336  }
337  return nb;
338 }

◆ runetochar()

int runetochar ( char *  str,
const Rune *  rune 
)

Definition at line 253 of file rune.c.

254 {
255  /* Runes are signed, so convert to unsigned for range check. */
256  unsigned long c;
257 
258  /*
259  * one character sequence
260  * 00000-0007F => 00-7F
261  */
262  c = *rune;
263  if(c <= Rune1) {
264  str[0] = c;
265  return 1;
266  }
267 
268  /*
269  * two character sequence
270  * 0080-07FF => T2 Tx
271  */
272  if(c <= Rune2) {
273  str[0] = T2 | (c >> 1*Bitx);
274  str[1] = Tx | (c & Maskx);
275  return 2;
276  }
277 
278  /*
279  * If the Rune is out of range, convert it to the error rune.
280  * Do this test here because the error rune encodes to three bytes.
281  * Doing it earlier would duplicate work, since an out of range
282  * Rune wouldn't have fit in one or two bytes.
283  */
284  if (c > Runemax)
285  c = Runeerror;
286 
287  /*
288  * three character sequence
289  * 0800-FFFF => T3 Tx Tx
290  */
291  if (c <= Rune3) {
292  str[0] = T3 | (c >> 2*Bitx);
293  str[1] = Tx | ((c >> 1*Bitx) & Maskx);
294  str[2] = Tx | (c & Maskx);
295  return 3;
296  }
297 
298  /*
299  * four character sequence (21-bit value)
300  * 10000-1FFFFF => T4 Tx Tx Tx
301  */
302  str[0] = T4 | (c >> 3*Bitx);
303  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
304  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
305  str[3] = Tx | (c & Maskx);
306  return 4;
307 }
T1
Definition: rune.c:28
Testx
Definition: rune.c:42
charntorune
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:66
Runeerror
Definition: utf.h:26
Maskx
Definition: rune.c:41
Bad
Definition: rune.c:44
Rune2
Definition: rune.c:36
Runemax
Definition: utf.h:27
uchar
unsigned char uchar
Definition: utfdef.h:8
Bit2
Definition: rune.c:23
Rune3
Definition: rune.c:37
T5
Definition: rune.c:33
Tx
Definition: rune.c:29
ulong
unsigned long ulong
Definition: utfdef.h:11
Bit4
Definition: rune.c:25
T2
Definition: rune.c:30
Bit3
Definition: rune.c:24
Bitx
Definition: rune.c:22
T4
Definition: rune.c:32
T3
Definition: rune.c:31
Rune1
Definition: rune.c:35
Bit1
Definition: rune.c:21
Rune4
Definition: rune.c:38
runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:253
Bit5
Definition: rune.c:26