// Walks every byte in [start, end) and adds 1 to the counter `n` for each
// byte that, read as a signed char, is >= -0x40. That comparison is false
// only for byte values 0x80..0xBF (the UTF-8 continuation-byte range), so
// lead bytes and ASCII bytes are the ones that are counted.
// NOTE(review): the declaration of `n` and the return statement are on
// lines not shown in this view; presumably the function returns `n` -- confirm.
30 static int CodepointDistance(
const char* start,
const char* end) {
33 for (
const char* p = start; p < end; ++p) {
// The comparison yields 0 or 1, so this adds without branching.
34 n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
// Convenience wrapper: turns a (pointer, length) pair into the
// [utf8, utf8 + len) byte range and forwards it to CodepointDistance.
39 static int CodepointCount(
const char* utf8,
int len) {
40 return CodepointDistance(utf8, utf8 + len);
46 return CodepointDistance(first.it_,
last.it_);
// Operates on the mutable buffer [start, start + len) and returns an int.
// NOTE(review): only part of the body is visible here; `out` and `good` are
// defined on lines not shown, so the exact rewriting rule and the meaning of
// the return value cannot be confirmed from this view.
51 static int ConvertToInterchangeValid(
char* start,
int len) {
// Remember where the input began; `start` is used again in the memmove below.
66 char*
const in = start;
// One-past-the-last byte of the input.
68 char*
const end = start + len;
// memmove (not memcpy) permits overlapping source and destination.
// NOTE(review): presumably `out` points into the same buffer as `start` -- confirm.
73 memmove(out, start, good);
// Makes sure there is a buffer of at least `new_capacity` bytes.
// NOTE(review): the statements that install `new_data` into data_ (and any
// update of ours_) are on lines not shown in this view.
102 void UnicodeText::Repr::reserve(
int new_capacity) {
// Nothing to do only when the current buffer is big enough AND ours_ is set;
// when ours_ is false, execution continues to the allocation below.
104 if (capacity_ >= new_capacity && ours_)
return;
// Grow to the larger of the request and 1.5x the current capacity plus 20.
// NOTE(review): `3 * capacity_` is plain int arithmetic with no overflow check.
107 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
108 char* new_data =
new char[capacity_];
// Carry the existing size_ bytes of content over to the new allocation.
112 memcpy(new_data, data_, size_);
// The old buffer is released only when ours_ is set.
113 if (ours_)
delete[] data_;
// Prepares the buffer for a logical size of `new_size` bytes.
// NOTE(review): the assignment that actually updates size_ is not visible in
// this view; presumably it follows the memset -- confirm.
120 void UnicodeText::Repr::resize(
int new_size) {
// reserve() is called when ours_ is false or when the buffer is too small.
124 if (!ours_ || new_size > capacity_) reserve(new_size);
// When growing, zero-fill the bytes between the old size and the new size.
126 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
// Releases the buffer when ours_ is set and zeroes both size_ and capacity_.
// NOTE(review): any reset of data_ / ours_ would be on lines not shown here.
134 void UnicodeText::Repr::clear() {
135 if (ours_)
delete[] data_;
137 size_ = capacity_ = 0;
// Copies `size` bytes from `data` into data_.
// NOTE(review): the line between the signature and the memcpy is not visible
// in this view; presumably it sizes data_ to hold `size` bytes -- confirm.
// memcpy requires that the source and destination do not overlap.
141 void UnicodeText::Repr::Copy(
const char* data,
int size) {
143 memcpy(data_, data, size);
// Switches this Repr over to the caller-supplied buffer `data`.
// NOTE(review): the assignments to data_, size_ and ours_ are on lines not
// shown in this view; only the capacity_ update is visible.
146 void UnicodeText::Repr::TakeOwnershipOf(
char* data,
int size,
int capacity) {
// Same pointer as the current buffer: return without touching any field
// (size_ and capacity_ are therefore NOT updated in this case).
147 if (data == data_)
return;
// Free the previous buffer only when ours_ is set and it is non-null.
148 if (ours_ && data_)
delete[] data_;
151 capacity_ = capacity;
// Makes data_ refer to the caller's bytes without copying them.
// NOTE(review): the updates of size_, capacity_ and ours_ are on lines not
// shown in this view.
155 void UnicodeText::Repr::PointTo(
const char* data,
int size) {
// Free the previous buffer only when ours_ is set and it is non-null.
156 if (ours_ && data_)
delete[] data_;
// Constness is cast away because data_ is a non-const pointer.
157 data_ = const_cast<char*>(data);
// Appends `byte_length` bytes from `bytes` to the end of the current content.
163 void UnicodeText::Repr::append(
const char* bytes,
int byte_length) {
// Ensure room for the combined size before writing.
// NOTE(review): `size_ + byte_length` is plain int arithmetic with no
// overflow check.
164 reserve(size_ + byte_length);
// Write the new bytes immediately after the existing size_ bytes.
165 memcpy(data_ + size_, bytes, byte_length);
166 size_ += byte_length;
// Builds a printf-style description of this Repr: pointer values, size_,
// capacity_, and "Owned" or "Alias" depending on ours_.
// NOTE(review): the format string has five conversions (%p %p %d %d %s) but
// only four arguments are visible; the argument for the first %p is on a
// line not shown in this view.
169 string UnicodeText::Repr::DebugString()
const {
170 return tensorflow::strings::Printf(
"{Repr %p data=%p size=%d capacity=%d %s}",
172 data_, size_, capacity_,
173 ours_ ?
"Owned" :
"Alias");
194 CHECK(first <=
last) <<
" Incompatible iterators";
195 repr_.append(first.it_,
last.it_ - first.it_);
200 CHECK(first <=
last) <<
" Incompatible iterators";
201 return string(first.it_,
last.it_ - first.it_);
215 repr_.Copy(src.repr_.data_, src.repr_.size_);
220 repr_.Copy(buffer, byte_length);
222 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
223 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
// Copies the caller's bytes into repr_ via Repr::Copy.
// No validity check of the buffer appears in the visible lines.
// NOTE(review): `byte_length` is presumably the second parameter, declared
// on a line not shown in this view; the return statement is not visible either.
228 UnicodeText& UnicodeText::UnsafeCopyUTF8(
const char* buffer,
230 repr_.Copy(buffer, byte_length);
239 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
241 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
242 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
// Forwards the caller's buffer to Repr::TakeOwnershipOf.
// No validity check of the buffer appears in the visible lines.
// NOTE(review): `byte_length` and `byte_capacity` are presumably parameters
// declared on lines not shown in this view; the return statement is not
// visible either.
247 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(
char* buffer,
250 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
258 repr_.PointTo(buffer, byte_length);
260 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
261 repr_.Copy(buffer, byte_length);
262 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
// Makes repr_ refer to the caller's bytes via Repr::PointTo.
// No validity check of the buffer appears in the visible lines.
// NOTE(review): `byte_length` is presumably the second parameter, declared
// on a line not shown in this view; the return statement is not visible either.
267 UnicodeText& UnicodeText::UnsafePointToUTF8(
const char* buffer,
269 repr_.PointTo(buffer, byte_length);
274 repr_.PointTo(src.repr_.data_, src.repr_.size_);
280 CHECK(first <=
last) <<
" Incompatible iterators";
288 repr_.append(u.repr_.data_, u.repr_.size_);
294 CHECK(first <=
last) <<
" Incompatible iterators";
295 repr_.append(first.it_,
last.it_ - first.it_);
// Appends `len` bytes starting at `utf8` to repr_ via Repr::append.
// No validity check of the bytes appears in the visible lines.
// NOTE(review): the return statement is on a line not shown in this view.
299 UnicodeText& UnicodeText::UnsafeAppendUTF8(
const char* utf8,
int len) {
300 repr_.append(utf8, len);
310 return UnsafeFind(look, start_pos);
314 return UnsafeFind(look,
begin());
318 const UnicodeText& look, const_iterator start_pos)
const {
326 StringPiece::size_type found = StringPiece::npos;
327 if (found == StringPiece::npos)
return end();
337 StringPiece looking_for(
"\xEF\xBF\xBD", 3);
359 repr_.append(buf, len);
361 LOG(
WARNING) <<
"Unicode value 0x" << std::hex << c
362 <<
" is not valid for interchange";
363 repr_.append(
" ", 1);
366 LOG(
WARNING) <<
"Illegal Unicode value: 0x" << std::hex << c;
367 repr_.append(
" ", 1);
372 return CodepointCount(repr_.data_, repr_.size_);
376 if (&lhs == &rhs)
return true;
377 if (lhs.repr_.size_ != rhs.repr_.size_)
return false;
378 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
382 return tensorflow::strings::Printf(
"{UnicodeText %p chars=%d repr=%s}",
385 repr_.DebugString().c_str());
418 return lhs.it_ < rhs.it_;
428 unsigned char byte1 = it_[0];
432 unsigned char byte2 = it_[1];
434 return ((byte1 & 0x1F) << 6)
437 unsigned char byte3 = it_[2];
439 return ((byte1 & 0x0F) << 12)
440 | ((byte2 & 0x3F) << 6)
443 unsigned char byte4 = it_[3];
444 return ((byte1 & 0x07) << 18)
445 | ((byte2 & 0x3F) << 12)
446 | ((byte3 & 0x3F) << 6)
461 utf8_output[0] = it_[0];
if ((it_[0] & 0xff) < 0x80)
return 1;
462 utf8_output[1] = it_[1];
if ((it_[0] & 0xff) < 0xE0)
return 2;
463 utf8_output[2] = it_[2];
if ((it_[0] & 0xff) < 0xF0)
return 3;
464 utf8_output[3] = it_[3];
473 if ((it_[0] & 0xff) < 0x80) {
475 }
else if ((it_[0] & 0xff) < 0xE0) {
477 }
else if ((it_[0] & 0xff) < 0xF0) {
488 const char*
end = start + len;
496 return tensorflow::strings::Printf(
"{iter %p}", it_);
505 while (it !=
end) tensorflow::strings::Appendf(&s,
"%X ", *it++);