tesseract  5.0.0-alpha-619-ge9db
strngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: strngs.cpp (Formerly strings.c)
3  * Description: STRING class functions.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <tesseract/strngs.h>
20 #include <cassert> // for assert
21 #include <cstdlib> // for malloc, free
22 #include <locale> // for std::locale::classic
23 #include <sstream> // for std::stringstream
24 #include "errcode.h" // for ASSERT_HOST
25 #include <tesseract/genericvector.h> // for GenericVector
26 #include <tesseract/helpers.h> // for ReverseN
27 #include <tesseract/serialis.h> // for TFile
28 
29 using tesseract::TFile;
30 
// Number of bytes reserved for the decimal text of an integer. Sized for the
// widest 64-bit value: a leading '-', up to 20 digits, and a trailing '\0'.
const int kMaxIntSize{22};
34 
35 /**********************************************************************
36  * STRING_HEADER provides metadata about the allocated buffer,
37  * including total capacity and how much used (strlen with '\0').
38  *
39  * The implementation hides this header at the start of the data
40  * buffer and appends the string on the end to keep sizeof(STRING)
41  * unchanged from earlier versions so serialization is not affected.
42  *
43  * The collection of macros provides different implementations depending
44  * on whether the string keeps track of its strlen or not, so that this
45  * feature can be added in later, when consumers no longer modify the string.
46  **********************************************************************/
47 
// Capacity, in bytes, used when a STRING is allocated with default size.
const int kMinCapacity{16};
50 
51 char* STRING::AllocData(int used, int capacity) {
52  data_ = static_cast<STRING_HEADER *>(malloc(capacity + sizeof(STRING_HEADER)));
53 
54  // header is the metadata for this memory block
55  STRING_HEADER* header = GetHeader();
56  header->capacity_ = capacity;
57  header->used_ = used;
58  return GetCStr();
59 }
60 
61 void STRING::DiscardData() {
62  free(data_);
63  data_ = nullptr;
64 }
65 
66 // This is a private method; ensure FixHeader is called (or used_ is well defined)
67 // beforehand
68 char* STRING::ensure_cstr(int32_t min_capacity) {
69  STRING_HEADER* orig_header = GetHeader();
70  if (min_capacity <= orig_header->capacity_)
71  return (reinterpret_cast<char *>(this->data_)) + sizeof(STRING_HEADER);
72 
73  // if we are going to grow bigger, than double our existing
74  // size, but if that still is not big enough then keep the
75  // requested capacity
76  if (min_capacity < 2 * orig_header->capacity_)
77  min_capacity = 2 * orig_header->capacity_;
78 
79  int alloc = sizeof(STRING_HEADER) + min_capacity;
80  auto* new_header = static_cast<STRING_HEADER*>(malloc(alloc));
81 
82  memcpy(&new_header[1], GetCStr(), orig_header->used_);
83  new_header->capacity_ = min_capacity;
84  new_header->used_ = orig_header->used_;
85 
86  // free old memory, then rebind to new memory
87  DiscardData();
88  data_ = new_header;
89 
90  assert(InvariantOk());
91  return (reinterpret_cast<char *>(data_)) + sizeof(STRING_HEADER);
92 }
93 
94 // This is const, but is modifying a mutable field
95 // this way it can be used on const or non-const instances.
96 void STRING::FixHeader() const {
97  const STRING_HEADER* header = GetHeader();
98  if (header->used_ < 0)
99  header->used_ = strlen(GetCStr()) + 1;
100 }
101 
102 
103 STRING::STRING() {
104  // Empty STRINGs contain just the "\0".
105  memcpy(AllocData(1, kMinCapacity), "", 1);
106 }
107 
108 STRING::STRING(const STRING& str) {
109  str.FixHeader();
110  const STRING_HEADER* str_header = str.GetHeader();
111  const int str_used = str_header->used_;
112  char *this_cstr = AllocData(str_used, str_used);
113  memcpy(this_cstr, str.GetCStr(), str_used);
114  assert(InvariantOk());
115 }
116 
117 STRING::STRING(const char* cstr) {
118  if (cstr == nullptr) {
119  // Empty STRINGs contain just the "\0".
120  memcpy(AllocData(1, kMinCapacity), "", 1);
121  } else {
122  const int len = strlen(cstr) + 1;
123  char* this_cstr = AllocData(len, len);
124  memcpy(this_cstr, cstr, len);
125  }
126  assert(InvariantOk());
127 }
128 
129 STRING::STRING(const char *data, int length) {
130  if (data == nullptr) {
131  // Empty STRINGs contain just the "\0".
132  memcpy(AllocData(1, kMinCapacity), "", 1);
133  } else {
134  char* this_cstr = AllocData(length + 1, length + 1);
135  memcpy(this_cstr, data, length);
136  this_cstr[length] = '\0';
137  }
138 }
139 
140 STRING::~STRING() {
141  DiscardData();
142 }
143 
144 // TODO(rays) Change all callers to use TFile and remove the old functions.
145 // Writes to the given file. Returns false in case of error.
146 bool STRING::Serialize(FILE* fp) const {
147  uint32_t len = length();
148  return tesseract::Serialize(fp, &len) &&
149  tesseract::Serialize(fp, GetCStr(), len);
150 }
151 // Writes to the given file. Returns false in case of error.
152 bool STRING::Serialize(TFile* fp) const {
153  uint32_t len = length();
154  return fp->Serialize(&len) &&
155  fp->Serialize(GetCStr(), len);
156 }
157 // Reads from the given file. Returns false in case of error.
158 // If swap is true, assumes a big/little-endian swap is needed.
159 bool STRING::DeSerialize(bool swap, FILE* fp) {
160  uint32_t len;
161  if (!tesseract::DeSerialize(fp, &len)) return false;
162  if (swap)
163  ReverseN(&len, sizeof(len));
164  // Arbitrarily limit the number of characters to protect against bad data.
165  if (len > UINT16_MAX) return false;
166  truncate_at(len);
167  return tesseract::DeSerialize(fp, GetCStr(), len);
168 }
169 // Reads from the given file. Returns false in case of error.
170 // If swap is true, assumes a big/little-endian swap is needed.
171 bool STRING::DeSerialize(TFile* fp) {
172  uint32_t len;
173  if (!fp->DeSerialize(&len)) return false;
174  truncate_at(len);
175  return fp->DeSerialize(GetCStr(), len);
176 }
177 
178 // As DeSerialize, but only seeks past the data - hence a static method.
179 bool STRING::SkipDeSerialize(TFile* fp) {
180  uint32_t len;
181  if (!fp->DeSerialize(&len)) return false;
182  return fp->Skip(len);
183 }
184 
185 bool STRING::contains(const char c) const {
186  return (c != '\0') && (strchr (GetCStr(), c) != nullptr);
187 }
188 
189 int32_t STRING::length() const {
190  FixHeader();
191  return GetHeader()->used_ - 1;
192 }
193 
194 const char* STRING::c_str() const {
195  const STRING_HEADER* header = GetHeader();
196  if (!header || header->used_ == 0)
197  return nullptr;
198 
199  // mark header length unreliable because tesseract might
200  // cast away the const and mutate the string directly.
201  header->used_ = -1;
202  return GetCStr();
203 }
204 
205 /******
206  * The STRING_IS_PROTECTED interface adds additional support to migrate
207  * code that needs to modify the STRING in ways not otherwise supported
208  * without violating encapsulation.
209  *
210  * Also makes the [] operator return a const so it is immutable
211  */
212 #if STRING_IS_PROTECTED
213 const char& STRING::operator[](int32_t index) const {
214  return GetCStr()[index];
215 }
216 
217 void STRING::insert_range(int32_t index, const char* str, int len) {
218  // if index is outside current range, then also grow size of string
219  // to accmodate the requested range.
220  STRING_HEADER* this_header = GetHeader();
221  int used = this_header->used_;
222  if (index > used)
223  used = index;
224 
225  char* this_cstr = ensure_cstr(used + len + 1);
226  if (index < used) {
227  // move existing string from index to '\0' inclusive.
228  memmove(this_cstr + index + len,
229  this_cstr + index,
230  this_header->used_ - index);
231  } else if (len > 0) {
232  // We are going to overwrite previous null terminator, so write the new one.
233  this_cstr[this_header->used_ + len - 1] = '\0';
234 
235  // If the old header did not have the terminator,
236  // then we need to account for it now that we've added it.
237  // Otherwise it was already accounted for; we just moved it.
238  if (this_header->used_ == 0)
239  ++this_header->used_;
240  }
241 
242  // Write new string to index.
243  // The string is already terminated from the conditions above.
244  memcpy(this_cstr + index, str, len);
245  this_header->used_ += len;
246 
247  assert(InvariantOk());
248 }
249 
250 void STRING::erase_range(int32_t index, int len) {
251  char* this_cstr = GetCStr();
252  STRING_HEADER* this_header = GetHeader();
253 
254  memcpy(this_cstr+index, this_cstr+index+len,
255  this_header->used_ - index - len);
256  this_header->used_ -= len;
257  assert(InvariantOk());
258 }
259 
260 #else
261 void STRING::truncate_at(int32_t index) {
262  ASSERT_HOST(index >= 0);
263  FixHeader();
264  char* this_cstr = ensure_cstr(index + 1);
265  this_cstr[index] = '\0';
266  GetHeader()->used_ = index + 1;
267  assert(InvariantOk());
268 }
269 
270 char& STRING::operator[](int32_t index) const {
271  // Code is casting away this const and mutating the string,
272  // so mark used_ as -1 to flag it unreliable.
273  GetHeader()->used_ = -1;
274  return (const_cast<char *>(GetCStr()))[index];
275 }
276 #endif
277 
278 void STRING::split(const char c, GenericVector<STRING> *splited) {
279  int start_index = 0;
280  const int len = length();
281  for (int i = 0; i < len; i++) {
282  if ((*this)[i] == c) {
283  if (i != start_index) {
284  (*this)[i] = '\0';
285  splited->push_back(STRING(GetCStr() + start_index, i - start_index));
286  (*this)[i] = c;
287  }
288  start_index = i + 1;
289  }
290  }
291 
292  if (len != start_index) {
293  splited->push_back(STRING(GetCStr() + start_index, len - start_index));
294  }
295 }
296 
297 bool STRING::operator==(const STRING& str) const {
298  FixHeader();
299  str.FixHeader();
300  const STRING_HEADER* str_header = str.GetHeader();
301  const STRING_HEADER* this_header = GetHeader();
302  const int this_used = this_header->used_;
303  const int str_used = str_header->used_;
304 
305  return (this_used == str_used)
306  && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
307 }
308 
309 bool STRING::operator!=(const STRING& str) const {
310  FixHeader();
311  str.FixHeader();
312  const STRING_HEADER* str_header = str.GetHeader();
313  const STRING_HEADER* this_header = GetHeader();
314  const int this_used = this_header->used_;
315  const int str_used = str_header->used_;
316 
317  return (this_used != str_used)
318  || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
319 }
320 
321 bool STRING::operator!=(const char* cstr) const {
322  FixHeader();
323  const STRING_HEADER* this_header = GetHeader();
324 
325  if (cstr == nullptr)
326  return this_header->used_ > 1; // either '\0' or nullptr
327  else {
328  const int32_t length = strlen(cstr) + 1;
329  return (this_header->used_ != length)
330  || (memcmp(GetCStr(), cstr, length) != 0);
331  }
332 }
333 
334 STRING& STRING::operator=(const STRING& str) {
335  str.FixHeader();
336  const STRING_HEADER* str_header = str.GetHeader();
337  const int str_used = str_header->used_;
338 
339  GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
340  char* this_cstr = ensure_cstr(str_used);
341  STRING_HEADER* this_header = GetHeader();
342 
343  memcpy(this_cstr, str.GetCStr(), str_used);
344  this_header->used_ = str_used;
345 
346  assert(InvariantOk());
347  return *this;
348 }
349 
350 STRING & STRING::operator+=(const STRING& str) {
351  FixHeader();
352  str.FixHeader();
353  const STRING_HEADER* str_header = str.GetHeader();
354  const char* str_cstr = str.GetCStr();
355  const int str_used = str_header->used_;
356  const int this_used = GetHeader()->used_;
357  char* this_cstr = ensure_cstr(this_used + str_used);
358 
359  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
360 
361  if (this_used > 1) {
362  memcpy(this_cstr + this_used - 1, str_cstr, str_used);
363  this_header->used_ += str_used - 1; // overwrite '\0'
364  } else {
365  memcpy(this_cstr, str_cstr, str_used);
366  this_header->used_ = str_used;
367  }
368 
369  assert(InvariantOk());
370  return *this;
371 }
372 
373 void STRING::add_str_int(const char* str, int number) {
374  if (str != nullptr)
375  *this += str;
376  // Allow space for the maximum possible length of int64_t.
377  char num_buffer[kMaxIntSize];
378  snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
379  num_buffer[kMaxIntSize - 1] = '\0';
380  *this += num_buffer;
381 }
382 // Appends the given string and double (as a %.8g) to this.
383 void STRING::add_str_double(const char* str, double number) {
384  if (str != nullptr)
385  *this += str;
386  std::stringstream stream;
387  // Use "C" locale (needed for double value).
388  stream.imbue(std::locale::classic());
389  // Use 8 digits for double value.
390  stream.precision(8);
391  stream << number;
392  *this += stream.str().c_str();
393 }
394 
395 STRING & STRING::operator=(const char* cstr) {
396  STRING_HEADER* this_header = GetHeader();
397  if (cstr) {
398  const int len = strlen(cstr) + 1;
399 
400  this_header->used_ = 0; // don't bother copying data if need to realloc
401  char* this_cstr = ensure_cstr(len);
402  this_header = GetHeader(); // for realloc
403  memcpy(this_cstr, cstr, len);
404  this_header->used_ = len;
405  } else {
406  // Reallocate to same state as default constructor.
407  DiscardData();
408  // Empty STRINGs contain just the "\0".
409  memcpy(AllocData(1, kMinCapacity), "", 1);
410  }
411 
412  assert(InvariantOk());
413  return *this;
414 }
415 
416 void STRING::assign(const char *cstr, int len) {
417  STRING_HEADER* this_header = GetHeader();
418  this_header->used_ = 0; // don't bother copying data if need to realloc
419  char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
420 
421  this_header = GetHeader(); // for realloc
422  memcpy(this_cstr, cstr, len);
423  this_cstr[len] = '\0';
424  this_header->used_ = len + 1;
425 
426  assert(InvariantOk());
427 }
428 
429 STRING STRING::operator+(const STRING& str) const {
430  STRING result(*this);
431  result += str;
432 
433  assert(InvariantOk());
434  return result;
435 }
436 
437 
438 STRING STRING::operator+(const char ch) const {
439  STRING result;
440  FixHeader();
441  const STRING_HEADER* this_header = GetHeader();
442  const int this_used = this_header->used_;
443  char* result_cstr = result.ensure_cstr(this_used + 1);
444  STRING_HEADER* result_header = result.GetHeader();
445  const int result_used = result_header->used_;
446 
447  // copies '\0' but we'll overwrite that
448  memcpy(result_cstr, GetCStr(), this_used);
449  result_cstr[result_used] = ch; // overwrite old '\0'
450  result_cstr[result_used + 1] = '\0'; // append on '\0'
451  ++result_header->used_;
452 
453  assert(InvariantOk());
454  return result;
455 }
456 
457 
458 STRING& STRING::operator+=(const char *str) {
459  if (!str || !*str) // empty string has no effect
460  return *this;
461 
462  FixHeader();
463  const int len = strlen(str) + 1;
464  const int this_used = GetHeader()->used_;
465  char* this_cstr = ensure_cstr(this_used + len);
466  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
467 
468  // if we had non-empty string then append overwriting old '\0'
469  // otherwise replace
470  if (this_used > 0) {
471  memcpy(this_cstr + this_used - 1, str, len);
472  this_header->used_ += len - 1;
473  } else {
474  memcpy(this_cstr, str, len);
475  this_header->used_ = len;
476  }
477 
478  assert(InvariantOk());
479  return *this;
480 }
481 
482 
483 STRING& STRING::operator+=(const char ch) {
484  if (ch == '\0')
485  return *this;
486 
487  FixHeader();
488  int this_used = GetHeader()->used_;
489  char* this_cstr = ensure_cstr(this_used + 1);
490  STRING_HEADER* this_header = GetHeader();
491 
492  if (this_used > 0)
493  --this_used; // undo old empty null if there was one
494 
495  this_cstr[this_used++] = ch; // append ch to end
496  this_cstr[this_used++] = '\0'; // append '\0' after ch
497  this_header->used_ = this_used;
498 
499  assert(InvariantOk());
500  return *this;
501 }
strngs.h
kMaxIntSize
const int kMaxIntSize
Definition: strngs.cpp:32
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
STRING::operator==
bool operator==(const STRING &string) const
Definition: strngs.cpp:294
STRING::operator!=
bool operator!=(const STRING &string) const
Definition: strngs.cpp:306
STRING
Definition: strngs.h:45
STRING::truncate_at
void truncate_at(int32_t index)
Definition: strngs.cpp:258
kMinCapacity
const int kMinCapacity
Definition: strngs.cpp:47
STRING::operator+=
STRING & operator+=(const char *string)
Definition: strngs.cpp:455
STRING::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:157
STRING::Serialize
bool Serialize(FILE *fp) const
Definition: strngs.cpp:144
genericvector.h
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TFile::DeSerialize
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:117
tesseract::TFile::Serialize
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:161
tesseract::TFile
Definition: serialis.h:75
STRING::~STRING
~STRING()
Definition: strngs.cpp:138
helpers.h
GenericVector< STRING >
STRING::operator+
STRING operator+(const STRING &string) const
Definition: strngs.cpp:426
STRING::length
int32_t length() const
Definition: strngs.cpp:187
STRING::contains
bool contains(char c) const
Definition: strngs.cpp:183
STRING::SkipDeSerialize
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:177
tesseract::TFile::Skip
bool Skip(size_t count)
Definition: serialis.cpp:205
STRING::assign
void assign(const char *cstr, int len)
Definition: strngs.cpp:413
STRING::add_str_double
void add_str_double(const char *str, double number)
Definition: strngs.cpp:380
STRING::operator[]
char & operator[](int32_t index) const
Definition: strngs.cpp:267
STRING::operator=
STRING & operator=(const char *string)
Definition: strngs.cpp:392
errcode.h
STRING::STRING
STRING()
Definition: strngs.cpp:101
serialis.h
ReverseN
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:183
tesseract::DeSerialize
bool DeSerialize(FILE *fp, char *data, size_t n=1)
Definition: serialis.cpp:41
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
STRING::split
void split(char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:275