33 memset(&bigram_table_, 0,
sizeof(bigram_table_));
38 for (
int ch1 = 0; ch1 <= bigram_table_.
max_char; ch1++) {
42 delete []char_bigram->
bigram;
54 file_name = data_file_path +
lang;
55 file_name +=
".cube.bigrams";
64 if (char_bigrams_obj ==
NULL) {
65 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): could not create "
66 "character bigrams object.\n");
76 vector<string> str_vec;
79 for (
int big = 0; big < str_vec.size(); big++) {
83 if (sscanf(str_vec[big].c_str(),
"%d %x %x", &cnt, &ch1, &ch2) != 3) {
84 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): invalid format "
85 "reading line: %s\n", str_vec[big].c_str());
86 delete char_bigrams_obj;
93 if (char_bigram ==
NULL) {
94 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
95 "additional memory for character bigram table.\n");
101 (table->
max_char + 1) *
sizeof(*char_bigram));
108 for (
int new_big = table->
max_char + 1; new_big <= ch1; new_big++) {
118 if (bigram ==
NULL) {
119 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
120 "memory for bigram.\n");
121 delete char_bigrams_obj;
135 new_big <= ch2; new_big++) {
154 log(
MAX(0.5, static_cast<double>(cnt)) /
158 return char_bigrams_obj;
172 if (!char_32_ptr || char_32_ptr[0] == 0) {
179 if (lower_32 && lower_32[0] != 0) {
181 cost =
MIN(cost, cost_lower);
185 if (upper_32 && upper_32[0] != 0) {
187 cost =
MIN(cost, cost_upper);
200 cost =
PairCost(
' ', char_32_ptr[0]);
201 for (c = 1; c < len; c++) {
202 cost +=
PairCost(char_32_ptr[c - 1], char_32_ptr[c]);
204 cost +=
PairCost(char_32_ptr[len - 1],
' ');
205 return static_cast<int>(cost /
static_cast<double>(len + 1));
static bool ReadFileToString(const string &file_name, string *str)
static CharBigrams * Create(const string &data_file_path, const string &lang)
int Cost(const char_32 *str, CharSet *char_set) const
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
int PairCost(char_32 ch1, char_32 ch2) const
static int StrLen(const char_32 *str)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
int MeanCostWithSpaces(const char_32 *char_32_ptr) const
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)