tesseract  5.0.0-alpha-619-ge9db
normstrngs_test.h
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
13 #define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
14 
15 #include <sstream> // for std::stringstream
16 #include <string>
17 #include <vector>
18 #include "absl/strings/str_cat.h"
19 #include "absl/strings/str_join.h"
20 #include <tesseract/unichar.h>
21 
22 namespace tesseract {
23 
24 inline std::string CodepointList(const std::vector<char32>& str32) {
25  std::stringstream result;
26  int total_chars = str32.size();
27  result << std::hex;
28  for (int i = 0; i < total_chars; ++i) {
29  result << "[" << str32[i] << "]";
30  }
31  return result.str();
32 }
33 
35  std::vector<char32> str32 = UNICHAR::UTF8ToUTF32(str.c_str());
36  return absl::StrCat("\"", str, "\" ", CodepointList(str32));
37 }
38 
39 inline std::string PrintStringVectorWithUnicodes(const std::vector<std::string>& glyphs) {
40  std::string result;
41  for (const auto& s : glyphs) {
42  result += "Glyph:";
43  result += PrintString32WithUnicodes(s) + "\n";
44  }
45  return result;
46 }
47 
48 inline void ExpectGraphemeModeResults(const std::string& str, UnicodeNormMode u_mode,
49  int unicode_count, int glyph_count,
50  int grapheme_count,
51  const std::string& target_str) {
52  std::vector<std::string> glyphs;
53  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
55  str.c_str(), &glyphs));
56  EXPECT_EQ(glyphs.size(), unicode_count)
58  EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
59  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
61  str.c_str(), &glyphs));
62  EXPECT_EQ(glyphs.size(), glyph_count)
64  EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
65  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
67  str.c_str(), &glyphs));
68  EXPECT_EQ(glyphs.size(), grapheme_count)
70  EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
71  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
73  true, str.c_str(), &glyphs));
74  EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs);
75  EXPECT_EQ(target_str, glyphs[0]);
76  std::string result;
77  EXPECT_TRUE(NormalizeUTF8String(
78  u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result));
79  EXPECT_EQ(target_str, result);
80 }
81 
82 } // namespace tesseract
83 
84 #endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
tesseract::OCRNorm::kNone
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::UnicodeNormMode
UnicodeNormMode
Definition: normstrngs.h:48
tesseract::NormalizeUTF8String
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:163
tesseract::PrintStringVectorWithUnicodes
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
Definition: normstrngs_test.h:39
tesseract::UNICHAR::UTF8ToUTF32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
tesseract::NormalizeCleanAndSegmentUTF8
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:188
tesseract::GraphemeNormMode::kSingleString
tesseract::GraphemeNormMode::kGlyphSplit
tesseract::PrintString32WithUnicodes
std::string PrintString32WithUnicodes(const std::string &str)
Definition: normstrngs_test.h:34
tesseract
Definition: baseapi.h:65
tesseract::ExpectGraphemeModeResults
void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, const std::string &target_str)
Definition: normstrngs_test.h:48
tesseract::CodepointList
std::string CodepointList(const std::vector< char32 > &str32)
Definition: normstrngs_test.h:24
unichar.h
tesseract::GraphemeNormMode::kIndividualUnicodes
tesseract::GraphemeNormMode::kCombined
tesseract::OCRNorm::kNormalize