tesseract  5.0.0-alpha-619-ge9db
UniLib Namespace Reference

Functions

bool IsInterchangeValid (char32 c)
 
int SpanInterchangeValid (const char *begin, int byte_length)
 
int SpanInterchangeValid (const std::string &src)
 
bool IsInterchangeValid (const char *src, int byte_length)
 
bool IsInterchangeValid (const std::string &src)
 
bool IsValidCodepoint (char32 c)
 
bool IsUTF8ValidCodepoint (StringPiece str)
 
int OneCharLen (const char *src)
 
bool IsTrailByte (char x)
 

Detailed Description

Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Function Documentation

◆ IsInterchangeValid() [1/3]

bool UniLib::IsInterchangeValid ( char32  c)

Definition at line 33 of file unilib.cc.

33  {
34  return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
35  (c >= 0x7F && c <= 0x9F) ||
36  (c >= 0xD800 && c <= 0xDFFF) ||
37  (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE);
38 }

◆ IsInterchangeValid() [2/3]

bool UniLib::IsInterchangeValid ( const char *  src,
int  byte_length 
)
inline

Definition at line 54 of file unilib.h.

54  {
55  return (byte_length == SpanInterchangeValid(src, byte_length));
56 }

◆ IsInterchangeValid() [3/3]

bool UniLib::IsInterchangeValid ( const std::string &  src)
inline

Definition at line 57 of file unilib.h.

57  {
58  return IsInterchangeValid(src.data(), src.size());
59 }

◆ IsTrailByte()

bool UniLib::IsTrailByte ( char  x)
inline

Definition at line 58 of file unilib_utf8_utils.h.

58  {
59  // return (x & 0xC0) == 0x80;
60  // Since trail bytes are always in [0x80, 0xBF], we can optimize:
61  return static_cast<signed char>(x) < -0x40;
62 }

◆ IsUTF8ValidCodepoint()

bool UniLib::IsUTF8ValidCodepoint ( StringPiece  str)
inline

Definition at line 40 of file unilib_utf8_utils.h.

40  {
41  char32 c;
42  int consumed;
43  // It's OK if str.length() > consumed.
44  return !str.empty()
45  && isvalidcharntorune(str.data(), str.size(), &c, &consumed)
46  && IsValidCodepoint(c);
47 }

◆ IsValidCodepoint()

bool UniLib::IsValidCodepoint ( char32  c)
inline

Definition at line 31 of file unilib_utf8_utils.h.

31  {
32  return (static_cast<uint32>(c) < 0xD800)
33  || (c >= 0xE000 && c <= 0x10FFFF);
34 }

◆ OneCharLen()

int UniLib::OneCharLen ( const char *  src)
inline

Definition at line 53 of file unilib_utf8_utils.h.

53  {
54  return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
55 }

◆ SpanInterchangeValid() [1/2]

int UniLib::SpanInterchangeValid ( const char *  begin,
int  byte_length 
)

Definition at line 40 of file unilib.cc.

40  {
41  char32 rune;
42  const char* p = begin;
43  const char* end = begin + byte_length;
44  while (p < end) {
45  int bytes_consumed = charntorune(&rune, p, end - p);
46  // We want to accept Runeerror == U+FFFD as a valid char, but it is used
47  // by chartorune to indicate error. Luckily, the real codepoint is size 3
48  // while errors return bytes_consumed <= 1.
49  if ((rune == Runeerror && bytes_consumed <= 1) ||
50  !IsInterchangeValid(rune)) {
51  break; // Found
52  }
53  p += bytes_consumed;
54  }
55  return p - begin;
56 }

◆ SpanInterchangeValid() [2/2]

int UniLib::SpanInterchangeValid ( const std::string &  src)
inline

Definition at line 46 of file unilib.h.

46  {
47  return SpanInterchangeValid(src.data(), src.size());
48 }
charntorune
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:66
Runeerror
Definition: utf.h:26
UniLib::SpanInterchangeValid
int SpanInterchangeValid(const std::string &src)
Definition: unilib.h:46
UniLib::IsInterchangeValid
bool IsInterchangeValid(const std::string &src)
Definition: unilib.h:57
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
UniLib::IsValidCodepoint
bool IsValidCodepoint(char32 c)
Definition: unilib_utf8_utils.h:31
char32
signed int char32
Definition: pango_font_info.h:33
isvalidcharntorune
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:247