tesseract  4.0.0-1-g2a2b
strngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: strngs.cpp (Formerly strings.c)
3  * Description: STRING class functions.
4  * Author: Ray Smith
5  * Created: Fri Feb 15 09:13:30 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "strngs.h"
21 #include <cassert> // for assert
22 #include <cstdlib> // for malloc, free
23 #include "errcode.h" // for ASSERT_HOST
24 #include "genericvector.h" // for GenericVector
25 #include "helpers.h" // for ReverseN
26 #include "serialis.h" // for TFile
27 
28 using tesseract::TFile;
29 
30 // Size of buffer needed to host the decimal representation of the maximum
31 // possible length of an int (in 64 bits), being -<20 digits>.
32 const int kMaxIntSize = 22;
33 // Size of buffer needed to host the decimal representation of the maximum
34 // possible length of a %.8g being -1.2345678e+999<nul> = 16.
35 const int kMaxDoubleSize = 16;
36 
37 /**********************************************************************
38  * STRING_HEADER provides metadata about the allocated buffer,
39  * including total capacity and how much used (strlen with '\0').
40  *
41  * The implementation hides this header at the start of the data
42  * buffer and appends the string on the end to keep sizeof(STRING)
43  * unchanged from earlier versions so serialization is not affected.
44  *
45  * The collection of MACROS provide different implementations depending
46  * on whether the string keeps track of its strlen or not so that this
47  * feature can be added in later when consumers don't modify the string
48  **********************************************************************/
49 
50 // Smallest string to allocate by default
51 const int kMinCapacity = 16;
52 
53 char* STRING::AllocData(int used, int capacity) {
54  data_ = (STRING_HEADER *)malloc(capacity + sizeof(STRING_HEADER));
55 
56  // header is the metadata for this memory block
57  STRING_HEADER* header = GetHeader();
58  header->capacity_ = capacity;
59  header->used_ = used;
60  return GetCStr();
61 }
62 
63 void STRING::DiscardData() {
64  free(data_);
65  data_ = nullptr;
66 }
67 
68 // This is a private method; ensure FixHeader is called (or used_ is well defined)
69 // beforehand
70 char* STRING::ensure_cstr(int32_t min_capacity) {
71  STRING_HEADER* orig_header = GetHeader();
72  if (min_capacity <= orig_header->capacity_)
73  return ((char *)this->data_) + sizeof(STRING_HEADER);
74 
75  // if we are going to grow bigger, than double our existing
76  // size, but if that still is not big enough then keep the
77  // requested capacity
78  if (min_capacity < 2 * orig_header->capacity_)
79  min_capacity = 2 * orig_header->capacity_;
80 
81  int alloc = sizeof(STRING_HEADER) + min_capacity;
82  STRING_HEADER* new_header = (STRING_HEADER*)(malloc(alloc));
83 
84  memcpy(&new_header[1], GetCStr(), orig_header->used_);
85  new_header->capacity_ = min_capacity;
86  new_header->used_ = orig_header->used_;
87 
88  // free old memory, then rebind to new memory
89  DiscardData();
90  data_ = new_header;
91 
92  assert(InvariantOk());
93  return ((char *)data_) + sizeof(STRING_HEADER);
94 }
95 
96 // This is const, but is modifying a mutable field
97 // this way it can be used on const or non-const instances.
98 void STRING::FixHeader() const {
99  const STRING_HEADER* header = GetHeader();
100  if (header->used_ < 0)
101  header->used_ = strlen(GetCStr()) + 1;
102 }
103 
104 
106  // Empty STRINGs contain just the "\0".
107  memcpy(AllocData(1, kMinCapacity), "", 1);
108 }
109 
110 STRING::STRING(const STRING& str) {
111  str.FixHeader();
112  const STRING_HEADER* str_header = str.GetHeader();
113  const int str_used = str_header->used_;
114  char *this_cstr = AllocData(str_used, str_used);
115  memcpy(this_cstr, str.GetCStr(), str_used);
116  assert(InvariantOk());
117 }
118 
119 STRING::STRING(const char* cstr) {
120  if (cstr == nullptr) {
121  // Empty STRINGs contain just the "\0".
122  memcpy(AllocData(1, kMinCapacity), "", 1);
123  } else {
124  const int len = strlen(cstr) + 1;
125  char* this_cstr = AllocData(len, len);
126  memcpy(this_cstr, cstr, len);
127  }
128  assert(InvariantOk());
129 }
130 
131 STRING::STRING(const char *data, int length) {
132  if (data == nullptr) {
133  // Empty STRINGs contain just the "\0".
134  memcpy(AllocData(1, kMinCapacity), "", 1);
135  } else {
136  char* this_cstr = AllocData(length + 1, length + 1);
137  memcpy(this_cstr, data, length);
138  this_cstr[length] = '\0';
139  }
140 }
141 
143  DiscardData();
144 }
145 
146 // TODO(rays) Change all callers to use TFile and remove the old functions.
147 // Writes to the given file. Returns false in case of error.
148 bool STRING::Serialize(FILE* fp) const {
149  uint32_t len = length();
150  return tesseract::Serialize(fp, &len) &&
151  tesseract::Serialize(fp, GetCStr(), len);
152 }
153 // Writes to the given file. Returns false in case of error.
154 bool STRING::Serialize(TFile* fp) const {
155  uint32_t len = length();
156  return fp->Serialize(&len) &&
157  fp->Serialize(GetCStr(), len);
158 }
159 // Reads from the given file. Returns false in case of error.
160 // If swap is true, assumes a big/little-endian swap is needed.
161 bool STRING::DeSerialize(bool swap, FILE* fp) {
162  uint32_t len;
163  if (!tesseract::DeSerialize(fp, &len)) return false;
164  if (swap)
165  ReverseN(&len, sizeof(len));
166  // Arbitrarily limit the number of characters to protect against bad data.
167  if (len > UINT16_MAX) return false;
168  truncate_at(len);
169  return tesseract::DeSerialize(fp, GetCStr(), len);
170 }
171 // Reads from the given file. Returns false in case of error.
172 // If swap is true, assumes a big/little-endian swap is needed.
174  uint32_t len;
175  if (!fp->DeSerialize(&len)) return false;
176  truncate_at(len);
177  return fp->DeSerialize(GetCStr(), len);
178 }
179 
180 // As DeSerialize, but only seeks past the data - hence a static method.
182  uint32_t len;
183  if (!fp->DeSerialize(&len)) return false;
184  return fp->Skip(len);
185 }
186 
187 bool STRING::contains(const char c) const {
188  return (c != '\0') && (strchr (GetCStr(), c) != nullptr);
189 }
190 
191 int32_t STRING::length() const {
192  FixHeader();
193  return GetHeader()->used_ - 1;
194 }
195 
196 const char* STRING::string() const {
197  const STRING_HEADER* header = GetHeader();
198  if (header->used_ == 0)
199  return nullptr;
200 
201  // mark header length unreliable because tesseract might
202  // cast away the const and mutate the string directly.
203  header->used_ = -1;
204  return GetCStr();
205 }
206 
207 const char* STRING::c_str() const {
208  return string();
209 }
210 
211 /******
212  * The STRING_IS_PROTECTED interface adds additional support to migrate
213  * code that needs to modify the STRING in ways not otherwise supported
214  * without violating encapsulation.
215  *
216  * Also makes the [] operator return a const so it is immutable
217  */
218 #if STRING_IS_PROTECTED
219 const char& STRING::operator[](int32_t index) const {
220  return GetCStr()[index];
221 }
222 
223 void STRING::insert_range(int32_t index, const char* str, int len) {
224  // if index is outside current range, then also grow size of string
225  // to accmodate the requested range.
226  STRING_HEADER* this_header = GetHeader();
227  int used = this_header->used_;
228  if (index > used)
229  used = index;
230 
231  char* this_cstr = ensure_cstr(used + len + 1);
232  if (index < used) {
233  // move existing string from index to '\0' inclusive.
234  memmove(this_cstr + index + len,
235  this_cstr + index,
236  this_header->used_ - index);
237  } else if (len > 0) {
238  // We are going to overwrite previous null terminator, so write the new one.
239  this_cstr[this_header->used_ + len - 1] = '\0';
240 
241  // If the old header did not have the terminator,
242  // then we need to account for it now that we've added it.
243  // Otherwise it was already accounted for; we just moved it.
244  if (this_header->used_ == 0)
245  ++this_header->used_;
246  }
247 
248  // Write new string to index.
249  // The string is already terminated from the conditions above.
250  memcpy(this_cstr + index, str, len);
251  this_header->used_ += len;
252 
253  assert(InvariantOk());
254 }
255 
256 void STRING::erase_range(int32_t index, int len) {
257  char* this_cstr = GetCStr();
258  STRING_HEADER* this_header = GetHeader();
259 
260  memcpy(this_cstr+index, this_cstr+index+len,
261  this_header->used_ - index - len);
262  this_header->used_ -= len;
263  assert(InvariantOk());
264 }
265 
266 #else
267 void STRING::truncate_at(int32_t index) {
268  ASSERT_HOST(index >= 0);
269  FixHeader();
270  char* this_cstr = ensure_cstr(index + 1);
271  this_cstr[index] = '\0';
272  GetHeader()->used_ = index + 1;
273  assert(InvariantOk());
274 }
275 
276 char& STRING::operator[](int32_t index) const {
277  // Code is casting away this const and mutating the string,
278  // so mark used_ as -1 to flag it unreliable.
279  GetHeader()->used_ = -1;
280  return ((char *)GetCStr())[index];
281 }
282 #endif
283 
284 void STRING::split(const char c, GenericVector<STRING> *splited) {
285  int start_index = 0;
286  const int len = length();
287  for (int i = 0; i < len; i++) {
288  if ((*this)[i] == c) {
289  if (i != start_index) {
290  (*this)[i] = '\0';
291  splited->push_back(STRING(GetCStr() + start_index, i - start_index));
292  (*this)[i] = c;
293  }
294  start_index = i + 1;
295  }
296  }
297 
298  if (len != start_index) {
299  splited->push_back(STRING(GetCStr() + start_index, len - start_index));
300  }
301 }
302 
303 bool STRING::operator==(const STRING& str) const {
304  FixHeader();
305  str.FixHeader();
306  const STRING_HEADER* str_header = str.GetHeader();
307  const STRING_HEADER* this_header = GetHeader();
308  const int this_used = this_header->used_;
309  const int str_used = str_header->used_;
310 
311  return (this_used == str_used)
312  && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
313 }
314 
315 bool STRING::operator!=(const STRING& str) const {
316  FixHeader();
317  str.FixHeader();
318  const STRING_HEADER* str_header = str.GetHeader();
319  const STRING_HEADER* this_header = GetHeader();
320  const int this_used = this_header->used_;
321  const int str_used = str_header->used_;
322 
323  return (this_used != str_used)
324  || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
325 }
326 
327 bool STRING::operator!=(const char* cstr) const {
328  FixHeader();
329  const STRING_HEADER* this_header = GetHeader();
330 
331  if (cstr == nullptr)
332  return this_header->used_ > 1; // either '\0' or nullptr
333  else {
334  const int32_t length = strlen(cstr) + 1;
335  return (this_header->used_ != length)
336  || (memcmp(GetCStr(), cstr, length) != 0);
337  }
338 }
339 
341  str.FixHeader();
342  const STRING_HEADER* str_header = str.GetHeader();
343  const int str_used = str_header->used_;
344 
345  GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
346  char* this_cstr = ensure_cstr(str_used);
347  STRING_HEADER* this_header = GetHeader();
348 
349  memcpy(this_cstr, str.GetCStr(), str_used);
350  this_header->used_ = str_used;
351 
352  assert(InvariantOk());
353  return *this;
354 }
355 
357  FixHeader();
358  str.FixHeader();
359  const STRING_HEADER* str_header = str.GetHeader();
360  const char* str_cstr = str.GetCStr();
361  const int str_used = str_header->used_;
362  const int this_used = GetHeader()->used_;
363  char* this_cstr = ensure_cstr(this_used + str_used);
364 
365  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
366 
367  if (this_used > 1) {
368  memcpy(this_cstr + this_used - 1, str_cstr, str_used);
369  this_header->used_ += str_used - 1; // overwrite '\0'
370  } else {
371  memcpy(this_cstr, str_cstr, str_used);
372  this_header->used_ = str_used;
373  }
374 
375  assert(InvariantOk());
376  return *this;
377 }
378 
379 void STRING::add_str_int(const char* str, int number) {
380  if (str != nullptr)
381  *this += str;
382  // Allow space for the maximum possible length of int64_t.
383  char num_buffer[kMaxIntSize];
384  snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
385  num_buffer[kMaxIntSize - 1] = '\0';
386  *this += num_buffer;
387 }
388 // Appends the given string and double (as a %.8g) to this.
389 void STRING::add_str_double(const char* str, double number) {
390  if (str != nullptr)
391  *this += str;
392  // Allow space for the maximum possible length of %8g.
393  char num_buffer[kMaxDoubleSize];
394  snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
395  num_buffer[kMaxDoubleSize - 1] = '\0';
396  *this += num_buffer;
397 }
398 
399 STRING & STRING::operator=(const char* cstr) {
400  STRING_HEADER* this_header = GetHeader();
401  if (cstr) {
402  const int len = strlen(cstr) + 1;
403 
404  this_header->used_ = 0; // don't bother copying data if need to realloc
405  char* this_cstr = ensure_cstr(len);
406  this_header = GetHeader(); // for realloc
407  memcpy(this_cstr, cstr, len);
408  this_header->used_ = len;
409  } else {
410  // Reallocate to same state as default constructor.
411  DiscardData();
412  // Empty STRINGs contain just the "\0".
413  memcpy(AllocData(1, kMinCapacity), "", 1);
414  }
415 
416  assert(InvariantOk());
417  return *this;
418 }
419 
420 void STRING::assign(const char *cstr, int len) {
421  STRING_HEADER* this_header = GetHeader();
422  this_header->used_ = 0; // don't bother copying data if need to realloc
423  char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
424 
425  this_header = GetHeader(); // for realloc
426  memcpy(this_cstr, cstr, len);
427  this_cstr[len] = '\0';
428  this_header->used_ = len + 1;
429 
430  assert(InvariantOk());
431 }
432 
433 STRING STRING::operator+(const STRING& str) const {
434  STRING result(*this);
435  result += str;
436 
437  assert(InvariantOk());
438  return result;
439 }
440 
441 
442 STRING STRING::operator+(const char ch) const {
443  STRING result;
444  FixHeader();
445  const STRING_HEADER* this_header = GetHeader();
446  const int this_used = this_header->used_;
447  char* result_cstr = result.ensure_cstr(this_used + 1);
448  STRING_HEADER* result_header = result.GetHeader();
449  const int result_used = result_header->used_;
450 
451  // copies '\0' but we'll overwrite that
452  memcpy(result_cstr, GetCStr(), this_used);
453  result_cstr[result_used] = ch; // overwrite old '\0'
454  result_cstr[result_used + 1] = '\0'; // append on '\0'
455  ++result_header->used_;
456 
457  assert(InvariantOk());
458  return result;
459 }
460 
461 
462 STRING& STRING::operator+=(const char *str) {
463  if (!str || !*str) // empty string has no effect
464  return *this;
465 
466  FixHeader();
467  const int len = strlen(str) + 1;
468  const int this_used = GetHeader()->used_;
469  char* this_cstr = ensure_cstr(this_used + len);
470  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
471 
472  // if we had non-empty string then append overwriting old '\0'
473  // otherwise replace
474  if (this_used > 0) {
475  memcpy(this_cstr + this_used - 1, str, len);
476  this_header->used_ += len - 1;
477  } else {
478  memcpy(this_cstr, str, len);
479  this_header->used_ = len;
480  }
481 
482  assert(InvariantOk());
483  return *this;
484 }
485 
486 
487 STRING& STRING::operator+=(const char ch) {
488  if (ch == '\0')
489  return *this;
490 
491  FixHeader();
492  int this_used = GetHeader()->used_;
493  char* this_cstr = ensure_cstr(this_used + 1);
494  STRING_HEADER* this_header = GetHeader();
495 
496  if (this_used > 0)
497  --this_used; // undo old empty null if there was one
498 
499  this_cstr[this_used++] = ch; // append ch to end
500  this_cstr[this_used++] = '\0'; // append '\0' after ch
501  this_header->used_ = this_used;
502 
503  assert(InvariantOk());
504  return *this;
505 }
bool Serialize(FILE *fp) const
Definition: strngs.cpp:148
const int kMinCapacity
Definition: strngs.cpp:51
STRING & operator+=(const char *string)
Definition: strngs.cpp:462
bool operator==(const STRING &string) const
Definition: strngs.cpp:303
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:161
const char * string() const
Definition: strngs.cpp:196
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:103
char & operator[](int32_t index) const
Definition: strngs.cpp:276
STRING operator+(const STRING &string) const
Definition: strngs.cpp:433
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:181
STRING & operator=(const char *string)
Definition: strngs.cpp:399
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:178
const char * c_str() const
Definition: strngs.cpp:207
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:284
const int kMaxDoubleSize
Definition: strngs.cpp:35
void add_str_double(const char *str, double number)
Definition: strngs.cpp:389
bool Skip(size_t count)
Definition: serialis.cpp:191
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
void truncate_at(int32_t index)
Definition: strngs.cpp:267
int push_back(T object)
STRING()
Definition: strngs.cpp:105
void add_str_int(const char *str, int number)
Definition: strngs.cpp:379
const int kMaxIntSize
Definition: strngs.cpp:32
Definition: strngs.h:45
bool contains(const char c) const
Definition: strngs.cpp:187
bool operator!=(const STRING &string) const
Definition: strngs.cpp:315
~STRING()
Definition: strngs.cpp:142
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27
int32_t length() const
Definition: strngs.cpp:191
void assign(const char *cstr, int len)
Definition: strngs.cpp:420
#define ASSERT_HOST(x)
Definition: errcode.h:84