tesseract  5.0.0-alpha-619-ge9db
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License");
6 // you may not use this file except in compliance with the License.
7 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifdef HAVE_CONFIG_H
16 #include "config_auto.h"
17 #endif
18 
19 #include <cctype>
20 #include <climits> // for CHAR_BIT
21 #include <cmath>
22 #include <cstdarg>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstdio>
26 #include <cstring>
27 #include <limits> // for std::numeric_limits
28 
29 #include "scanutils.h"
30 
// Bit flags collected while a single conversion specification is parsed.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' given: drop the value, do not assign
  FL_INV = 1 << 1,    // Character-set with inverse
  FL_WIDTH = 1 << 2,  // Field width specified
  FL_MINUS = 1 << 3,  // Negative number
};
37 
// Integer "ranks" selecting the destination type of a conversion. The 'h' and
// 'l' length modifiers step the rank down/up by one, so the values of the
// ordinary integer ranks must be consecutive.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,  // Reached by "ll" (two increments from RANK_INT)
  RANK_PTR = std::numeric_limits<int>::max()  // Special value used for pointers
};

// Bounds used to canonicalize a rank after the length modifiers were applied.
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j', 'z' and 't' length modifiers.
// NOTE(review): kSizeTRank/kPtrDiffRank assume size_t and ptrdiff_t have the
// width of long - confirm for LLP64 targets.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
53 
// Reason for abandoning the scan before the format string is exhausted.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
59 
60 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long.
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
64 
// Consumes ASCII whitespace from s. The first character that is not
// whitespace is pushed back onto the stream and also returned, so the caller
// can peek at it; the return value is whatever fgetc delivered last (EOF at
// end of input).
static inline int
SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isascii(next) && isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
72 
73 static inline void
74 SetBit(unsigned long *bitmap, unsigned int bit) {
75  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
76 }
77 
78 static inline int
79 TestBit(unsigned long *bitmap, unsigned int bit) {
80  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
81 }
82 
// Returns the value of the digit character ch in the given base, or -1 if ch
// is not a digit of that base. Only octal, decimal and hexadecimal are
// distinguished: any base below 10 accepts '0'-'7', bases >= 10 accept
// '0'-'9', and the letters 'a'-'f' / 'A'-'F' are accepted for base 16 only.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
  } else if (base == 16) {
    // Hex letters stop at F; anything beyond would yield a value >= base.
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}

// IO (re-)implementations -----------------------------------------------------

// Reads an integer from s: leading ASCII whitespace is skipped, then a single
// optional '+' or '-' and a run of digits is consumed. The first character
// that does not belong to the number is pushed back onto the stream.
//
// base == 0 selects the base from the prefix ("0x"/"0X" -> 16, a leading "0"
// -> 8, anything else -> 10); base == 16 accepts an optional "0x"/"0X" prefix.
// The value is accumulated in a uintmax_t without overflow detection, and a
// leading '-' returns the unsigned negation of the parsed magnitude.
static uintmax_t streamtoumax(FILE* s, int base) {
  bool minus = false;
  uintmax_t v = 0;
  int d;
  int c;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Assign correct base
  if (base == 0) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        base = 16;
        c = fgetc(s);
      } else {
        base = 8;
      }
    } else {
      // No prefix: the number is decimal. Leaving base at 0 would make the
      // accumulation below compute v*0 + d, i.e. keep only the last digit.
      base = 10;
    }
  } else if (base == 16) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') c = fgetc(s);
    }
  }

  // Actual number parsing
  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
    v = v * base + d;

  ungetc(c, s);
  return minus ? -v : v;
}
134 
// Reads a decimal floating point number of the form
//   [whitespace][+|-]digits[.digits][(e|E)[+|-]digits]
// from s and returns its value. The first character that does not belong to
// the number is pushed back onto the stream.
//
// The integer and fraction parts are accumulated in 64 bit integers. Digits
// that no longer fit are still consumed: surplus integer digits only scale the
// result by a power of ten, surplus fraction digits are ignored.
static double streamtofloat(FILE* s) {
  constexpr uint64_t kU64Max = std::numeric_limits<uint64_t>::max();
  // Decimal exponents beyond this already overflow/underflow a double, so
  // accumulation stops there instead of overflowing the int counters.
  constexpr int kExponentCap = 100000;

  bool minus = false;
  uint64_t v = 0;   // Integer part
  uint64_t w = 0;   // Fraction digits read as an integer
  uint64_t k = 1;   // 10^(number of fraction digits accumulated in w)
  int dropped = 0;  // Integer digits that did not fit into v
  int c;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part. Only decimal digits are valid here, so the range is tested
  // directly (EOF is negative and therefore fails the test as well).
  for (; c >= '0' && c <= '9'; c = fgetc(s)) {
    const uint64_t d = static_cast<uint64_t>(c - '0');
    if (dropped == 0 && v <= (kU64Max - d) / 10) {
      v = v * 10 + d;
    } else if (dropped < kExponentCap) {
      ++dropped;  // Keep the magnitude, the digit itself is below precision
    }
  }

  // Fraction part. w < k always holds, so checking k is sufficient.
  if (c == '.') {
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      if (k <= kU64Max / 10) {
        w = w * 10 + static_cast<uint64_t>(c - '0');
        k *= 10;
      }
    }
  }

  double f = static_cast<double>(v) +
             static_cast<double>(w) / static_cast<double>(k);
  if (dropped > 0) {
    f *= pow(10.0, static_cast<double>(dropped));
  }

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      if (exponent < kExponentCap) {
        exponent = exponent * 10 + (c - '0');
      }
    }
    exponent *= expsign;
    f *= pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);

  return minus ? -f : f;
}
178 
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// Variadic front end: packs the arguments following `format` into a va_list,
// hands them to tvfscanf and returns whatever tvfscanf returns.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
191 
192 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
193  const char *p = format;
194  char ch;
195  int q = 0;
196  uintmax_t val = 0;
197  int rank = RANK_INT; // Default rank
198  unsigned int width = UINT_MAX;
199  int base;
200  int flags = 0;
201  enum {
202  ST_NORMAL, // Ground state
203  ST_FLAGS, // Special flags
204  ST_WIDTH, // Field width
205  ST_MODIFIERS, // Length or conversion modifiers
206  ST_MATCH_INIT, // Initial state of %[ sequence
207  ST_MATCH, // Main state of %[ sequence
208  ST_MATCH_RANGE, // After - in a %[ sequence
209  } state = ST_NORMAL;
210  char *sarg = nullptr; // %s %c or %[ string argument
211  enum Bail bail = BAIL_NONE;
212  int converted = 0; // Successful conversions
213  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
214  (CHAR_BIT * sizeof(long))];
215  int matchinv = 0; // Is match map inverted?
216  unsigned char range_start = 0;
217  auto start_off = std::ftell(stream);
218 
219  // Skip leading spaces
220  SkipSpace(stream);
221 
222  while ((ch = *p++) && !bail) {
223  switch (state) {
224  case ST_NORMAL:
225  if (ch == '%') {
226  state = ST_FLAGS;
227  flags = 0; rank = RANK_INT; width = UINT_MAX;
228  } else if (isascii(ch) && isspace(ch)) {
229  SkipSpace(stream);
230  } else {
231  if (fgetc(stream) != ch)
232  bail = BAIL_ERR; // Match failure
233  }
234  break;
235 
236  case ST_FLAGS:
237  if (ch == '*') {
238  flags |= FL_SPLAT;
239  } else if ('0' <= ch && ch <= '9') {
240  width = (ch-'0');
241  state = ST_WIDTH;
242  flags |= FL_WIDTH;
243  } else {
244  state = ST_MODIFIERS;
245  p--; // Process this character again
246  }
247  break;
248 
249  case ST_WIDTH:
250  if (ch >= '0' && ch <= '9') {
251  width = width*10+(ch-'0');
252  } else {
253  state = ST_MODIFIERS;
254  p--; // Process this character again
255  }
256  break;
257 
258  case ST_MODIFIERS:
259  switch (ch) {
260  // Length modifiers - nonterminal sequences
261  case 'h':
262  rank--; // Shorter rank
263  break;
264  case 'l':
265  rank++; // Longer rank
266  break;
267  case 'j':
268  rank = kIntMaxRank;
269  break;
270  case 'z':
271  rank = kSizeTRank;
272  break;
273  case 't':
274  rank = kPtrDiffRank;
275  break;
276  case 'L':
277  case 'q':
278  rank = RANK_LONGLONG; // long double/long long
279  break;
280 
281  default:
282  // Output modifiers - terminal sequences
283  state = ST_NORMAL; // Next state will be normal
284  if (rank < kMinRank) // Canonicalize rank
285  rank = kMinRank;
286  else if (rank > kMaxRank)
287  rank = kMaxRank;
288 
289  switch (ch) {
290  case 'P': // Upper case pointer
291  case 'p': // Pointer
292  rank = RANK_PTR;
293  base = 0;
294  goto scan_int;
295 
296  case 'i': // Base-independent integer
297  base = 0;
298  goto scan_int;
299 
300  case 'd': // Decimal integer
301  base = 10;
302  goto scan_int;
303 
304  case 'o': // Octal integer
305  base = 8;
306  goto scan_int;
307 
308  case 'u': // Unsigned decimal integer
309  base = 10;
310  goto scan_int;
311 
312  case 'x': // Hexadecimal integer
313  case 'X':
314  base = 16;
315  goto scan_int;
316 
317  case 'n': // Number of characters consumed
318  val = std::ftell(stream) - start_off;
319  goto set_integer;
320 
321  scan_int:
322  q = SkipSpace(stream);
323  if (q <= 0) {
324  bail = BAIL_EOF;
325  break;
326  }
327  val = streamtoumax(stream, base);
328  // fall through
329 
330  set_integer:
331  if (!(flags & FL_SPLAT)) {
332  converted++;
333  switch(rank) {
334  case RANK_CHAR:
335  *va_arg(ap, unsigned char *)
336  = static_cast<unsigned char>(val);
337  break;
338  case RANK_SHORT:
339  *va_arg(ap, unsigned short *)
340  = static_cast<unsigned short>(val);
341  break;
342  case RANK_INT:
343  *va_arg(ap, unsigned int *)
344  = static_cast<unsigned int>(val);
345  break;
346  case RANK_LONG:
347  *va_arg(ap, unsigned long *)
348  = static_cast<unsigned long>(val);
349  break;
350  case RANK_LONGLONG:
351  *va_arg(ap, unsigned long long *)
352  = static_cast<unsigned long long>(val);
353  break;
354  case RANK_PTR:
355  *va_arg(ap, void **)
356  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
357  break;
358  }
359  }
360  break;
361 
362  case 'f': // Preliminary float value parsing
363  case 'g':
364  case 'G':
365  case 'e':
366  case 'E':
367  q = SkipSpace(stream);
368  if (q <= 0) {
369  bail = BAIL_EOF;
370  break;
371  }
372 
373  {
374  double fval = streamtofloat(stream);
375  if (!(flags & FL_SPLAT)) {
376  if (rank == RANK_INT)
377  *va_arg(ap, float *) = static_cast<float>(fval);
378  else if (rank == RANK_LONG)
379  *va_arg(ap, double *) = static_cast<double>(fval);
380  converted++;
381  }
382  }
383  break;
384 
385  case 'c': // Character
386  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
387  sarg = va_arg(ap, char *);
388  while (width--) {
389  if ((q = fgetc(stream)) <= 0) {
390  bail = BAIL_EOF;
391  break;
392  }
393  if (!(flags & FL_SPLAT)) {
394  *sarg++ = q;
395  converted++;
396  }
397  }
398  break;
399 
400  case 's': // String
401  {
402  if (!(flags & FL_SPLAT)) {
403  sarg = va_arg(ap, char *);
404  }
405  unsigned length = 0;
406  while (width--) {
407  q = fgetc(stream);
408  if ((isascii(q) && isspace(q)) || (q <= 0)) {
409  ungetc(q, stream);
410  break;
411  }
412  if (!(flags & FL_SPLAT)) {
413  sarg[length] = q;
414  }
415  length++;
416  }
417  if (length == 0) {
418  bail = BAIL_EOF;
419  } else if (!(flags & FL_SPLAT)) {
420  sarg[length] = '\0'; // Terminate output
421  converted++;
422  }
423  }
424  break;
425 
426  case '[': // Character range
427  sarg = va_arg(ap, char *);
428  state = ST_MATCH_INIT;
429  matchinv = 0;
430  memset(matchmap, 0, sizeof matchmap);
431  break;
432 
433  case '%': // %% sequence
434  if (fgetc(stream) != '%')
435  bail = BAIL_ERR;
436  break;
437 
438  default: // Anything else
439  bail = BAIL_ERR; // Unknown sequence
440  break;
441  }
442  }
443  break;
444 
445  case ST_MATCH_INIT: // Initial state for %[ match
446  if (ch == '^' && !(flags & FL_INV)) {
447  matchinv = 1;
448  } else {
449  SetBit(matchmap, static_cast<unsigned char>(ch));
450  state = ST_MATCH;
451  }
452  break;
453 
454  case ST_MATCH: // Main state for %[ match
455  if (ch == ']') {
456  goto match_run;
457  } else if (ch == '-') {
458  range_start = static_cast<unsigned char>(ch);
459  state = ST_MATCH_RANGE;
460  } else {
461  SetBit(matchmap, static_cast<unsigned char>(ch));
462  }
463  break;
464 
465  case ST_MATCH_RANGE: // %[ match after -
466  if (ch == ']') {
467  SetBit(matchmap, static_cast<unsigned char>('-'));
468  goto match_run;
469  } else {
470  int i;
471  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
472  SetBit(matchmap, i);
473  state = ST_MATCH;
474  }
475  break;
476 
477  match_run: // Match expression finished
478  char* oarg = sarg;
479  while (width) {
480  q = fgetc(stream);
481  auto qc = static_cast<unsigned char>(q);
482  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
483  ungetc(q, stream);
484  break;
485  }
486  if (!(flags & FL_SPLAT)) *sarg = q;
487  sarg++;
488  }
489  if (oarg == sarg) {
490  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
491  } else if (!(flags & FL_SPLAT)) {
492  *sarg = '\0';
493  converted++;
494  }
495  break;
496  }
497  }
498 
499  if (bail == BAIL_EOF && !converted)
500  converted = -1; // Return EOF (-1)
501 
502  return converted;
503 }
Ranks
Ranks
Definition: scanutils.cpp:38
tfscanf
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:181
LongBit
size_t LongBit()
Definition: scanutils.cpp:61
BAIL_EOF
Definition: scanutils.cpp:56
FL_MINUS
Definition: scanutils.cpp:35
RANK_PTR
Definition: scanutils.cpp:44
Flags
Flags
Definition: scanutils.cpp:31
Bail
Bail
Definition: scanutils.cpp:54
BAIL_ERR
Definition: scanutils.cpp:57
RANK_INT
Definition: scanutils.cpp:41
FL_WIDTH
Definition: scanutils.cpp:34
RANK_CHAR
Definition: scanutils.cpp:39
FL_INV
Definition: scanutils.cpp:33
kPtrDiffRank
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:52
scanutils.h
kIntMaxRank
enum Ranks kIntMaxRank
Definition: scanutils.cpp:50
FL_SPLAT
Definition: scanutils.cpp:32
BAIL_NONE
Definition: scanutils.cpp:55
RANK_LONG
Definition: scanutils.cpp:42
kMinRank
enum Ranks kMinRank
Definition: scanutils.cpp:47
kSizeTRank
enum Ranks kSizeTRank
Definition: scanutils.cpp:51
RANK_SHORT
Definition: scanutils.cpp:40
kMaxRank
enum Ranks kMaxRank
Definition: scanutils.cpp:48
RANK_LONGLONG
Definition: scanutils.cpp:43