tesseract  4.0.0-1-g2a2b
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <cctype>
27 #include <cmath>
28 #include <cstdarg>
29 #include <cstddef>
30 #include <cstring>
31 #include <climits>
32 #include <cstdio>
33 #include <limits>
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <fcntl.h>
37 
38 #include "scanutils.h"
39 #include "tprintf.h"
40 
// Bit flags that describe the conversion specification currently being parsed.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: parse the value but do not assign it
  FL_INV   = 1 << 1,  // Character set (%[) is inverted
  FL_WIDTH = 1 << 2,  // An explicit field width was given
  FL_MINUS = 1 << 3,  // Negative number
};
47 
// Integer "rank" of a conversion. 'h' lowers the rank by one step and 'l'
// raises it by one step, starting from RANK_INT, so the numeric values must
// stay consecutive.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,  // Reached by "ll", 'L' and 'q'
  RANK_PTR = std::numeric_limits<int>::max()  // Special value used for pointers
};

// Smallest and largest rank a length modifier sequence may produce; ranks
// outside this interval are clamped before the conversion is performed.
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j' (intmax_t), 'z' (size_t) and 't' (ptrdiff_t)
// length modifiers.
// NOTE(review): kMaxRank, kIntMaxRank and kPtrDiffRank are used by tvfscanf
// but their definitions were not visible here; the values follow the usual
// klibc-style mapping - confirm against the upstream file.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
63 
// Reason why format processing stopped early.
enum Bail {
  BAIL_NONE = 0,  // Keep going, nothing went wrong
  BAIL_EOF = 1,   // Input ended
  BAIL_ERR = 2    // Input did not match the format
};
69 
70 // Helper functions ------------------------------------------------------------
// Number of bits in one `long`, i.e. in one word of the %[ match bitmap.
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
74 
// Consumes ASCII whitespace from the stream and returns the first character
// that is not whitespace (or EOF). That character is pushed back so that it is
// still the next one to be read.
static inline int SkipSpace(FILE *s) {
  int next;
  do {
    next = fgetc(s);
  } while (isascii(next) && isspace(next));
  ungetc(next, s);  // Leave the non-space character in the stream
  return next;
}
82 
83 static inline void
84 SetBit(unsigned long *bitmap, unsigned int bit) {
85  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
86 }
87 
88 static inline int
89 TestBit(unsigned long *bitmap, unsigned int bit) {
90  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
91 }
92 
// Returns the numeric value of the character `ch` when read as a digit of the
// given base, or -1 if `ch` is not a valid digit in that base.
// Letters (either case) stand for the values 10 and up; a value is only
// accepted when it is strictly smaller than `base`, so e.g. '8' is rejected
// for base 8 and 'G' is rejected for base 16.
static inline int DigitValue(int ch, int base) {
  int value;
  if (ch >= '0' && ch <= '9') {
    value = ch - '0';
  } else if (ch >= 'A' && ch <= 'Z') {
    value = ch - 'A' + 10;
  } else if (ch >= 'a' && ch <= 'z') {
    value = ch - 'a' + 10;
  } else {
    return -1;
  }
  return (value < base) ? value : -1;
}
104 
105 // IO (re-)implementations -----------------------------------------------------
106 static uintmax_t streamtoumax(FILE* s, int base) {
107  int minus = 0;
108  uintmax_t v = 0;
109  int d, c = 0;
110 
111  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));
112 
113  // Single optional + or -
114  if (c == '-' || c == '+') {
115  minus = (c == '-');
116  c = fgetc(s);
117  }
118 
119  // Assign correct base
120  if (base == 0) {
121  if (c == '0') {
122  c = fgetc(s);
123  if (c == 'x' || c == 'X') {
124  base = 16;
125  c = fgetc(s);
126  } else {
127  base = 8;
128  }
129  }
130  } else if (base == 16) {
131  if (c == '0') {
132  c = fgetc(s);
133  if (c == 'x' || c == 'X') c = fgetc(s);
134  }
135  }
136 
137  // Actual number parsing
138  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
139  v = v*base + d;
140 
141  ungetc(c, s);
142  return minus ? -v : v;
143 }
144 
// Parses a decimal floating-point number from the stream, one character at a
// time: optional leading ASCII whitespace, an optional sign, integer digits,
// an optional '.' followed by fraction digits, and an optional exponent
// ('e' or 'E', optional sign, digits).
//
// Returns the parsed value (0.0 when no digits were found). The first
// character that is not part of the number is pushed back onto the stream.
//
// All accumulation is done in floating point so that long digit strings cannot
// overflow an integer accumulator.
static double streamtofloat(FILE* s) {
  // Fraction digits beyond this scale cannot change the resulting double; they
  // are still consumed but no longer accumulated.
  const double kMaxFractionScale = 1e18;
  // Exponents beyond this magnitude already saturate a double; capping keeps
  // the int accumulator from overflowing on absurdly long exponent strings.
  const int kMaxExponent = 9999;

  bool negative = false;
  int c = 0;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    negative = (c == '-');
    c = fgetc(s);
  }

  // Integer part (EOF is never in the digit range, so it ends the loop too).
  double whole = 0.0;
  for (; c >= '0' && c <= '9'; c = fgetc(s))
    whole = whole * 10.0 + (c - '0');

  // Fraction part, kept as numerator / scale so short inputs stay exact.
  double fraction = 0.0;
  double scale = 1.0;
  if (c == '.') {
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      if (scale < kMaxFractionScale) {
        fraction = fraction * 10.0 + (c - '0');
        scale *= 10.0;
      }
    }
  }
  double f = whole + fraction / scale;

  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      if (exponent <= kMaxExponent) exponent = exponent * 10 + (c - '0');
    }
    // Skip the scaling for zero so that 0 * infinity cannot produce NaN.
    if (f != 0.0)
      f *= std::pow(10.0, static_cast<double>(expsign * exponent));
  }
  ungetc(c, s);  // Leave the terminating character for the caller

  return negative ? -f : f;
}
189 
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// Variadic front end of tvfscanf: packs the trailing arguments into a va_list,
// forwards them together with `stream` and `format`, and returns whatever
// tvfscanf returns.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
202 
203 #ifdef EMBEDDED
204 
205 int fscanf(FILE* stream, const char *format, ...) {
206  va_list ap;
207  int rv;
208 
209  va_start(ap, format);
210  rv = tvfscanf(stream, format, ap);
211  va_end(ap);
212 
213  return rv;
214 }
215 
216 int vfscanf(FILE* stream, const char *format, ...) {
217  va_list ap;
218  int rv;
219 
220  va_start(ap, format);
221  rv = tvfscanf(stream, format, ap);
222  va_end(ap);
223 
224  return rv;
225 }
226 #endif
227 
228 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
229  const char *p = format;
230  char ch;
231  int q = 0;
232  uintmax_t val = 0;
233  int rank = RANK_INT; // Default rank
234  unsigned int width = UINT_MAX;
235  int base;
236  int flags = 0;
237  enum {
238  ST_NORMAL, // Ground state
239  ST_FLAGS, // Special flags
240  ST_WIDTH, // Field width
241  ST_MODIFIERS, // Length or conversion modifiers
242  ST_MATCH_INIT, // Initial state of %[ sequence
243  ST_MATCH, // Main state of %[ sequence
244  ST_MATCH_RANGE, // After - in a %[ sequence
245  } state = ST_NORMAL;
246  char *sarg = nullptr; // %s %c or %[ string argument
247  enum Bail bail = BAIL_NONE;
248  int converted = 0; // Successful conversions
249  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
250  (CHAR_BIT * sizeof(long))];
251  int matchinv = 0; // Is match map inverted?
252  unsigned char range_start = 0;
253  off_t start_off = ftell(stream);
254 
255  // Skip leading spaces
256  SkipSpace(stream);
257 
258  while ((ch = *p++) && !bail) {
259  switch (state) {
260  case ST_NORMAL:
261  if (ch == '%') {
262  state = ST_FLAGS;
263  flags = 0; rank = RANK_INT; width = UINT_MAX;
264  } else if (isascii(ch) && isspace(ch)) {
265  SkipSpace(stream);
266  } else {
267  if (fgetc(stream) != ch)
268  bail = BAIL_ERR; // Match failure
269  }
270  break;
271 
272  case ST_FLAGS:
273  if (ch == '*') {
274  flags |= FL_SPLAT;
275  } else if ('0' <= ch && ch <= '9') {
276  width = (ch-'0');
277  state = ST_WIDTH;
278  flags |= FL_WIDTH;
279  } else {
280  state = ST_MODIFIERS;
281  p--; // Process this character again
282  }
283  break;
284 
285  case ST_WIDTH:
286  if (ch >= '0' && ch <= '9') {
287  width = width*10+(ch-'0');
288  } else {
289  state = ST_MODIFIERS;
290  p--; // Process this character again
291  }
292  break;
293 
294  case ST_MODIFIERS:
295  switch (ch) {
296  // Length modifiers - nonterminal sequences
297  case 'h':
298  rank--; // Shorter rank
299  break;
300  case 'l':
301  rank++; // Longer rank
302  break;
303  case 'j':
304  rank = kIntMaxRank;
305  break;
306  case 'z':
307  rank = kSizeTRank;
308  break;
309  case 't':
310  rank = kPtrDiffRank;
311  break;
312  case 'L':
313  case 'q':
314  rank = RANK_LONGLONG; // long double/long long
315  break;
316 
317  default:
318  // Output modifiers - terminal sequences
319  state = ST_NORMAL; // Next state will be normal
320  if (rank < kMinRank) // Canonicalize rank
321  rank = kMinRank;
322  else if (rank > kMaxRank)
323  rank = kMaxRank;
324 
325  switch (ch) {
326  case 'P': // Upper case pointer
327  case 'p': // Pointer
328  rank = RANK_PTR;
329  base = 0;
330  goto scan_int;
331 
332  case 'i': // Base-independent integer
333  base = 0;
334  goto scan_int;
335 
336  case 'd': // Decimal integer
337  base = 10;
338  goto scan_int;
339 
340  case 'o': // Octal integer
341  base = 8;
342  goto scan_int;
343 
344  case 'u': // Unsigned decimal integer
345  base = 10;
346  goto scan_int;
347 
348  case 'x': // Hexadecimal integer
349  case 'X':
350  base = 16;
351  goto scan_int;
352 
353  case 'n': // Number of characters consumed
354  val = ftell(stream) - start_off;
355  goto set_integer;
356 
357  scan_int:
358  q = SkipSpace(stream);
359  if (q <= 0) {
360  bail = BAIL_EOF;
361  break;
362  }
363  val = streamtoumax(stream, base);
364  // fall through
365 
366  set_integer:
367  if (!(flags & FL_SPLAT)) {
368  converted++;
369  switch(rank) {
370  case RANK_CHAR:
371  *va_arg(ap, unsigned char *)
372  = static_cast<unsigned char>(val);
373  break;
374  case RANK_SHORT:
375  *va_arg(ap, unsigned short *)
376  = static_cast<unsigned short>(val);
377  break;
378  case RANK_INT:
379  *va_arg(ap, unsigned int *)
380  = static_cast<unsigned int>(val);
381  break;
382  case RANK_LONG:
383  *va_arg(ap, unsigned long *)
384  = static_cast<unsigned long>(val);
385  break;
386  case RANK_LONGLONG:
387  *va_arg(ap, unsigned long long *)
388  = static_cast<unsigned long long>(val);
389  break;
390  case RANK_PTR:
391  *va_arg(ap, void **)
392  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
393  break;
394  }
395  }
396  break;
397 
398  case 'f': // Preliminary float value parsing
399  case 'g':
400  case 'G':
401  case 'e':
402  case 'E':
403  q = SkipSpace(stream);
404  if (q <= 0) {
405  bail = BAIL_EOF;
406  break;
407  }
408 
409  {
410  double fval = streamtofloat(stream);
411  if (!(flags & FL_SPLAT)) {
412  if (rank == RANK_INT)
413  *va_arg(ap, float *) = static_cast<float>(fval);
414  else if (rank == RANK_LONG)
415  *va_arg(ap, double *) = static_cast<double>(fval);
416  converted++;
417  }
418  }
419  break;
420 
421  case 'c': // Character
422  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
423  sarg = va_arg(ap, char *);
424  while (width--) {
425  if ((q = fgetc(stream)) <= 0) {
426  bail = BAIL_EOF;
427  break;
428  }
429  if (!(flags & FL_SPLAT)) {
430  *sarg++ = q;
431  converted++;
432  }
433  }
434  break;
435 
436  case 's': // String
437  {
438  if (!(flags & FL_SPLAT)) {
439  sarg = va_arg(ap, char *);
440  }
441  unsigned length = 0;
442  while (width--) {
443  q = fgetc(stream);
444  if ((isascii(q) && isspace(q)) || (q <= 0)) {
445  ungetc(q, stream);
446  break;
447  }
448  if (!(flags & FL_SPLAT)) {
449  sarg[length] = q;
450  }
451  length++;
452  }
453  if (length == 0) {
454  bail = BAIL_EOF;
455  } else if (!(flags & FL_SPLAT)) {
456  sarg[length] = '\0'; // Terminate output
457  converted++;
458  }
459  }
460  break;
461 
462  case '[': // Character range
463  sarg = va_arg(ap, char *);
464  state = ST_MATCH_INIT;
465  matchinv = 0;
466  memset(matchmap, 0, sizeof matchmap);
467  break;
468 
469  case '%': // %% sequence
470  if (fgetc(stream) != '%')
471  bail = BAIL_ERR;
472  break;
473 
474  default: // Anything else
475  bail = BAIL_ERR; // Unknown sequence
476  break;
477  }
478  }
479  break;
480 
481  case ST_MATCH_INIT: // Initial state for %[ match
482  if (ch == '^' && !(flags & FL_INV)) {
483  matchinv = 1;
484  } else {
485  SetBit(matchmap, static_cast<unsigned char>(ch));
486  state = ST_MATCH;
487  }
488  break;
489 
490  case ST_MATCH: // Main state for %[ match
491  if (ch == ']') {
492  goto match_run;
493  } else if (ch == '-') {
494  range_start = static_cast<unsigned char>(ch);
495  state = ST_MATCH_RANGE;
496  } else {
497  SetBit(matchmap, static_cast<unsigned char>(ch));
498  }
499  break;
500 
501  case ST_MATCH_RANGE: // %[ match after -
502  if (ch == ']') {
503  SetBit(matchmap, static_cast<unsigned char>('-'));
504  goto match_run;
505  } else {
506  int i;
507  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
508  SetBit(matchmap, i);
509  state = ST_MATCH;
510  }
511  break;
512 
513  match_run: // Match expression finished
514  char* oarg = sarg;
515  while (width) {
516  q = fgetc(stream);
517  unsigned char qc = static_cast<unsigned char>(q);
518  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
519  ungetc(q, stream);
520  break;
521  }
522  if (!(flags & FL_SPLAT)) *sarg = q;
523  sarg++;
524  }
525  if (oarg == sarg) {
526  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
527  } else if (!(flags & FL_SPLAT)) {
528  *sarg = '\0';
529  converted++;
530  }
531  break;
532  }
533  }
534 
535  if (bail == BAIL_EOF && !converted)
536  converted = -1; // Return EOF (-1)
537 
538  return converted;
539 }
540 
541 #ifdef EMBEDDED
// EMBEDDED builds only: creat expressed through open(). The file is opened
// write-only, created if it does not exist and truncated if it does; `mode`
// is passed through to open(), whose return value is returned.
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
  return open(pathname, open_flags, mode);
}
545 
546 #endif // EMBEDDED
enum Ranks kMinRank
Definition: scanutils.cpp:57
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:192
size_t LongBit()
Definition: scanutils.cpp:71
enum Ranks kIntMaxRank
Definition: scanutils.cpp:60
Bail
Definition: scanutils.cpp:64
enum Ranks kMaxRank
Definition: scanutils.cpp:58
Ranks
Definition: scanutils.cpp:48
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:62
enum Ranks kSizeTRank
Definition: scanutils.cpp:61
Flags
Definition: scanutils.cpp:41