tesseract
5.0.0-alpha-619-ge9db
unilib.h
Go to the documentation of this file.
1
17
// Routines to do manipulation of Unicode characters or text
18
//
19
// The StructurallyValid routines accept buffers of arbitrary bytes.
20
// For CoerceToStructurallyValid(), the input buffer and output buffers may
21
// point to exactly the same memory.
22
//
23
// In all other cases, the UTF-8 string must be structurally valid and
24
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
25
// Debug builds take a fatal error for invalid UTF-8 input.
26
// The input and output buffers may not overlap at all.
27
//
28
// The char32 routines are here only for convenience; they convert to UTF-8
29
// internally and use the UTF-8 routines.
30
31
#ifndef UTIL_UTF8_UNILIB_H__
32
#define UTIL_UTF8_UNILIB_H__
33
34
#include <string>
35
#include "
syntaxnet/base.h
"
36
37
// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
38
// but they are defined in unilib_utf8_utils.h.
39
//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
40
41
namespace
UniLib
{
42
43
// Returns the length in bytes of the prefix of src that is all
44
// interchange valid UTF-8
45
// Pointer/length form. `src` addresses the bytes to examine and `byte_length`
// is the number of bytes available there. The result is the size, in bytes,
// of the leading portion of that buffer that is interchange-valid UTF-8.
// Only declared in this header; the definition is out of line.
int SpanInterchangeValid(const char* src, int byte_length);
46
inline
int
SpanInterchangeValid
(
const
std::string
& src) {
47
return
SpanInterchangeValid
(src.data(), src.size());
48
}
49
50
// Returns true if the source is all interchange valid UTF-8
51
// "Interchange valid" is stronger than structurally valid --
52
// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
53
// Single-codepoint form: reports whether `codepoint` on its own is
// interchange valid. Only declared in this header; the definition is out of
// line.
bool IsInterchangeValid(char32 codepoint);
54
inline
bool
IsInterchangeValid
(
const
char
* src,
int
byte_length) {
55
return
(byte_length ==
SpanInterchangeValid
(src, byte_length));
56
}
57
inline
bool
IsInterchangeValid
(
const
std::string
& src) {
58
return
IsInterchangeValid
(src.data(), src.size());
59
}
60
61
}
// namespace UniLib
62
63
#endif // UTIL_UTF8_UNILIB_H__
string
std::string string
Definition:
equationdetect_test.cc:21
base.h
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition:
unilib.cc:33
UniLib::SpanInterchangeValid
int SpanInterchangeValid(const char *begin, int byte_length)
Definition:
unilib.cc:40
char32
signed int char32
Definition:
pango_font_info.h:33
UniLib
Definition:
unilib.cc:24
unittest
util
utf8
unilib.h
Generated on Thu Jan 30 2020 14:22:22 for tesseract by
1.8.16