tesseract  5.0.0-alpha-619-ge9db
unilib.h
Go to the documentation of this file.
1 
17 // Routines to do manipulation of Unicode characters or text
18 //
19 // The StructurallyValid routines accept buffers of arbitrary bytes.
20 // For CoerceToStructurallyValid(), the input buffer and output buffers may
21 // point to exactly the same memory.
22 //
23 // In all other cases, the UTF-8 string must be structurally valid and
24 // have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
25 // Debug builds take a fatal error for invalid UTF-8 input.
26 // The input and output buffers may not overlap at all.
27 //
28 // The char32 routines are here only for convenience; they convert to UTF-8
29 // internally and use the UTF-8 routines.
30 
31 #ifndef UTIL_UTF8_UNILIB_H__
32 #define UTIL_UTF8_UNILIB_H__
33 
34 #include <string>
35 #include "syntaxnet/base.h"
36 
37 // OneCharLen, IsValidCodepoint, and IsTrailByte are defined in
38 // unilib_utf8_utils.h; the include that would re-export them is disabled below.
39 //#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
40 
41 namespace UniLib {
42 
43 // Returns the length in bytes of the prefix of src (byte_length bytes long)
44 // that is all interchange-valid UTF-8.
45 int SpanInterchangeValid(const char* src, int byte_length);
46 inline int SpanInterchangeValid(const std::string& src) {  // whole-string overload
47  return SpanInterchangeValid(src.data(), static_cast<int>(src.size()));
48 }
49 
50 // Returns true if the source is all interchange-valid UTF-8.
51 // "Interchange valid" is stronger than structurally valid --
52 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
53 bool IsInterchangeValid(char32 codepoint);
54 inline bool IsInterchangeValid(const char* src, int byte_length) {
55  return SpanInterchangeValid(src, byte_length) == byte_length;  // valid prefix must span the whole buffer
56 }
57 inline bool IsInterchangeValid(const std::string& src) {  // whole-string overload
58  return IsInterchangeValid(src.data(), static_cast<int>(src.size()));
59 }
60 
61 } // namespace UniLib
62 
63 #endif // UTIL_UTF8_UNILIB_H__
string
std::string string
Definition: equationdetect_test.cc:21
base.h
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
UniLib::SpanInterchangeValid
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:40
char32
signed int char32
Definition: pango_font_info.h:33
UniLib
Definition: unilib.cc:24