tesseract  5.0.0-alpha-619-ge9db
rune.c
Go to the documentation of this file.
1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  * Copyright (c) 2002 by Lucent Technologies.
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose without fee is hereby granted, provided that this entire notice
6  * is included in all copies of any software which is or includes a copy
7  * or modification of this software and in all copies of the supporting
8  * documentation for such software.
9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10  * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13  */
14 #include <stdarg.h>
15 #include <string.h>
16 #include "third_party/utf/utf.h"
17 #include "third_party/utf/utfdef.h"
18 
19 enum
20 {
21  Bit1 = 7,
22  Bitx = 6,
23  Bit2 = 5,
24  Bit3 = 4,
25  Bit4 = 3,
26  Bit5 = 2,
27 
28  T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
29  Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
30  T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
31  T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
32  T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
33  T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
34 
35  Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
36  Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
37  Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
38  Rune4 = (1<<(Bit4+3*Bitx))-1,
39  /* 0001 1111 1111 1111 1111 1111 */
40 
41  Maskx = (1<<Bitx)-1, /* 0011 1111 */
42  Testx = Maskx ^ 0xFF, /* 1100 0000 */
43 
45 };
46 
47 /*
48  * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
49  * This is a slower but "safe" version of the old chartorune
50  * that works on strings that are not necessarily null-terminated.
51  *
52  * If you know for sure that your string is null-terminated,
53  * chartorune will be a bit faster.
54  *
55  * It is guaranteed not to attempt to access "length"
56  * past the incoming pointer. This is to avoid
57  * possible access violations. If the string appears to be
58  * well-formed but incomplete (i.e., to get the whole Rune
59  * we'd need to read past str+length) then we'll set the Rune
60  * to Bad and return 0.
61  *
62  * Note that if we have decoding problems for other
63  * reasons, we return 1 instead of 0.
64  */
65 int
66 charntorune(Rune *rune, const char *str, int length)
67 {
68  int c, c1, c2, c3;
69  long l;
70 
71  /* When we're not allowed to read anything */
72  if(length <= 0) {
73  goto badlen;
74  }
75 
76  /*
77  * one character sequence (7-bit value)
78  * 00000-0007F => T1
79  */
80  c = *(uchar*)str;
81  if(c < Tx) {
82  *rune = c;
83  return 1;
84  }
85 
86  // If we can't read more than one character we must stop
87  if(length <= 1) {
88  goto badlen;
89  }
90 
91  /*
92  * two character sequence (11-bit value)
93  * 0080-07FF => T2 Tx
94  */
95  c1 = *(uchar*)(str+1) ^ Tx;
96  if(c1 & Testx)
97  goto bad;
98  if(c < T3) {
99  if(c < T2)
100  goto bad;
101  l = ((c << Bitx) | c1) & Rune2;
102  if(l <= Rune1)
103  goto bad;
104  *rune = l;
105  return 2;
106  }
107 
108  // If we can't read more than two characters we must stop
109  if(length <= 2) {
110  goto badlen;
111  }
112 
113  /*
114  * three character sequence (16-bit value)
115  * 0800-FFFF => T3 Tx Tx
116  */
117  c2 = *(uchar*)(str+2) ^ Tx;
118  if(c2 & Testx)
119  goto bad;
120  if(c < T4) {
121  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
122  if(l <= Rune2)
123  goto bad;
124  *rune = l;
125  return 3;
126  }
127 
128  if (length <= 3)
129  goto badlen;
130 
131  /*
132  * four character sequence (21-bit value)
133  * 10000-1FFFFF => T4 Tx Tx Tx
134  */
135  c3 = *(uchar*)(str+3) ^ Tx;
136  if (c3 & Testx)
137  goto bad;
138  if (c < T5) {
139  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
140  if (l <= Rune3)
141  goto bad;
142  if (l > Runemax)
143  goto bad;
144  *rune = l;
145  return 4;
146  }
147 
148  // Support for 5-byte or longer UTF-8 would go here, but
149  // since we don't have that, we'll just fall through to bad.
150 
151  /*
152  * bad decoding
153  */
154 bad:
155  *rune = Bad;
156  return 1;
157 badlen:
158  *rune = Bad;
159  return 0;
160 
161 }
162 
163 
164 /*
165  * This is the older "unsafe" version, which works fine on
166  * null-terminated strings.
167  */
168 int
169 chartorune(Rune *rune, const char *str)
170 {
171  int c, c1, c2, c3;
172  long l;
173 
174  /*
175  * one character sequence
176  * 00000-0007F => T1
177  */
178  c = *(uchar*)str;
179  if(c < Tx) {
180  *rune = c;
181  return 1;
182  }
183 
184  /*
185  * two character sequence
186  * 0080-07FF => T2 Tx
187  */
188  c1 = *(uchar*)(str+1) ^ Tx;
189  if(c1 & Testx)
190  goto bad;
191  if(c < T3) {
192  if(c < T2)
193  goto bad;
194  l = ((c << Bitx) | c1) & Rune2;
195  if(l <= Rune1)
196  goto bad;
197  *rune = l;
198  return 2;
199  }
200 
201  /*
202  * three character sequence
203  * 0800-FFFF => T3 Tx Tx
204  */
205  c2 = *(uchar*)(str+2) ^ Tx;
206  if(c2 & Testx)
207  goto bad;
208  if(c < T4) {
209  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
210  if(l <= Rune2)
211  goto bad;
212  *rune = l;
213  return 3;
214  }
215 
216  /*
217  * four character sequence (21-bit value)
218  * 10000-1FFFFF => T4 Tx Tx Tx
219  */
220  c3 = *(uchar*)(str+3) ^ Tx;
221  if (c3 & Testx)
222  goto bad;
223  if (c < T5) {
224  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
225  if (l <= Rune3)
226  goto bad;
227  if (l > Runemax)
228  goto bad;
229  *rune = l;
230  return 4;
231  }
232 
233  /*
234  * Support for 5-byte or longer UTF-8 would go here, but
235  * since we don't have that, we'll just fall through to bad.
236  */
237 
238  /*
239  * bad decoding
240  */
241 bad:
242  *rune = Bad;
243  return 1;
244 }
245 
246 int
247 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
248  *consumed = charntorune(rune, str, length);
249  return *rune != Runeerror || *consumed == 3;
250 }
251 
252 int
253 runetochar(char *str, const Rune *rune)
254 {
255  /* Runes are signed, so convert to unsigned for range check. */
256  unsigned long c;
257 
258  /*
259  * one character sequence
260  * 00000-0007F => 00-7F
261  */
262  c = *rune;
263  if(c <= Rune1) {
264  str[0] = c;
265  return 1;
266  }
267 
268  /*
269  * two character sequence
270  * 0080-07FF => T2 Tx
271  */
272  if(c <= Rune2) {
273  str[0] = T2 | (c >> 1*Bitx);
274  str[1] = Tx | (c & Maskx);
275  return 2;
276  }
277 
278  /*
279  * If the Rune is out of range, convert it to the error rune.
280  * Do this test here because the error rune encodes to three bytes.
281  * Doing it earlier would duplicate work, since an out of range
282  * Rune wouldn't have fit in one or two bytes.
283  */
284  if (c > Runemax)
285  c = Runeerror;
286 
287  /*
288  * three character sequence
289  * 0800-FFFF => T3 Tx Tx
290  */
291  if (c <= Rune3) {
292  str[0] = T3 | (c >> 2*Bitx);
293  str[1] = Tx | ((c >> 1*Bitx) & Maskx);
294  str[2] = Tx | (c & Maskx);
295  return 3;
296  }
297 
298  /*
299  * four character sequence (21-bit value)
300  * 10000-1FFFFF => T4 Tx Tx Tx
301  */
302  str[0] = T4 | (c >> 3*Bitx);
303  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
304  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
305  str[3] = Tx | (c & Maskx);
306  return 4;
307 }
308 
309 int
311 {
312  char str[10];
313 
314  return runetochar(str, &rune);
315 }
316 
317 int
318 runenlen(const Rune *r, int nrune)
319 {
320  int nb;
321  ulong c; /* Rune is signed, so use unsigned for range check. */
322 
323  nb = 0;
324  while(nrune--) {
325  c = *r++;
326  if (c <= Rune1)
327  nb++;
328  else if (c <= Rune2)
329  nb += 2;
330  else if (c <= Rune3)
331  nb += 3;
332  else if (c <= Runemax)
333  nb += 4;
334  else
335  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
336  }
337  return nb;
338 }
339 
340 int
341 fullrune(const char *str, int n)
342 {
343  if (n > 0) {
344  int c = *(uchar*)str;
345  if (c < Tx)
346  return 1;
347  if (n > 1) {
348  if (c < T3)
349  return 1;
350  if (n > 2) {
351  if (c < T4 || n > 3)
352  return 1;
353  }
354  }
355  }
356  return 0;
357 }
T1
Definition: rune.c:28
Testx
Definition: rune.c:42
charntorune
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:66
Runeerror
Definition: utf.h:26
Maskx
Definition: rune.c:41
Bad
Definition: rune.c:44
Rune2
Definition: rune.c:36
Runemax
Definition: utf.h:27
utfdef.h
Rune
signed int Rune
Definition: utf.h:19
uchar
unsigned char uchar
Definition: utfdef.h:8
runenlen
int runenlen(const Rune *r, int nrune)
Definition: rune.c:318
Bit2
Definition: rune.c:23
Rune3
Definition: rune.c:37
chartorune
int chartorune(Rune *rune, const char *str)
Definition: rune.c:169
T5
Definition: rune.c:33
Tx
Definition: rune.c:29
ulong
unsigned long ulong
Definition: utfdef.h:11
Bit4
Definition: rune.c:25
T2
Definition: rune.c:30
Bit3
Definition: rune.c:24
fullrune
int fullrune(const char *str, int n)
Definition: rune.c:341
runelen
int runelen(Rune rune)
Definition: rune.c:310
Bitx
Definition: rune.c:22
T4
Definition: rune.c:32
T3
Definition: rune.c:31
Rune1
Definition: rune.c:35
isvalidcharntorune
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:247
Bit1
Definition: rune.c:21
Rune4
Definition: rune.c:38
utf.h
runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:253
Bit5
Definition: rune.c:26