tesseract  5.0.0-alpha-619-ge9db
unilib.cc
Go to the documentation of this file.
1 
17 // Author: sligocki@google.com (Shawn Ligocki)
18 
19 #include "util/utf8/unilib.h"
20 
21 #include "syntaxnet/base.h"
22 #include "third_party/utf/utf.h"
23 
24 namespace UniLib {
25 
26 // Codepoints not allowed for interchange are:
27 // C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),
28 // Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),
29 // Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)
30 // C1 controls: U+007F to U+009F
31 // Surrogates: U+D800 to U+DFFF
32 // Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
34  return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
35  (c >= 0x7F && c <= 0x9F) ||
36  (c >= 0xD800 && c <= 0xDFFF) ||
37  (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE);
38 }
39 
40 int SpanInterchangeValid(const char* begin, int byte_length) {
41  char32 rune;
42  const char* p = begin;
43  const char* end = begin + byte_length;
44  while (p < end) {
45  int bytes_consumed = charntorune(&rune, p, end - p);
46  // We want to accept Runeerror == U+FFFD as a valid char, but it is used
47  // by chartorune to indicate error. Luckily, the real codepoint is size 3
48  // while errors return bytes_consumed <= 1.
49  if ((rune == Runeerror && bytes_consumed <= 1) ||
50  !IsInterchangeValid(rune)) {
51  break; // Found
52  }
53  p += bytes_consumed;
54  }
55  return p - begin;
56 }
57 
58 } // namespace UniLib
charntorune
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:66
Runeerror
Definition: utf.h:26
base.h
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
UniLib::SpanInterchangeValid
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:40
char32
signed int char32
Definition: pango_font_info.h:33
UniLib
Definition: unilib.cc:24
unilib.h
utf.h