54 file_name = data_file_path +
lang;
55 file_name +=
".cube.bigrams";
64 if (char_bigrams_obj ==
NULL) {
65 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): could not create "
66 "character bigrams object.\n");
69 CharBigramTable *table = &char_bigrams_obj->bigram_table_;
73 table->char_bigram =
NULL;
76 vector<string> str_vec;
79 for (
int big = 0; big < str_vec.size(); big++) {
83 if (sscanf(str_vec[big].c_str(),
"%d %x %x", &cnt, &ch1, &ch2) != 3) {
84 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): invalid format "
85 "reading line: %s\n", str_vec[big].c_str());
86 delete char_bigrams_obj;
91 if (ch1 > table->max_char) {
92 CharBigram *char_bigram =
new CharBigram[ch1 + 1];
93 if (char_bigram ==
NULL) {
94 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
95 "additional memory for character bigram table.\n");
99 if (table->char_bigram !=
NULL && table->max_char >= 0) {
100 memcpy(char_bigram, table->char_bigram,
101 (table->max_char + 1) *
sizeof(*char_bigram));
103 delete []table->char_bigram;
105 table->char_bigram = char_bigram;
108 for (
int new_big = table->max_char + 1; new_big <= ch1; new_big++) {
109 table->char_bigram[new_big].total_cnt = 0;
110 table->char_bigram[new_big].max_char = -1;
111 table->char_bigram[new_big].bigram =
NULL;
113 table->max_char = ch1;
116 if (ch2 > table->char_bigram[ch1].max_char) {
117 Bigram *bigram =
new Bigram[ch2 + 1];
118 if (bigram ==
NULL) {
119 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
120 "memory for bigram.\n");
121 delete char_bigrams_obj;
125 if (table->char_bigram[ch1].bigram !=
NULL &&
126 table->char_bigram[ch1].max_char >= 0) {
127 memcpy(bigram, table->char_bigram[ch1].bigram,
128 (table->char_bigram[ch1].max_char + 1) *
sizeof(*bigram));
129 delete []table->char_bigram[ch1].bigram;
131 table->char_bigram[ch1].bigram = bigram;
134 for (
int new_big = table->char_bigram[ch1].max_char + 1;
135 new_big <= ch2; new_big++) {
136 table->char_bigram[ch1].bigram[new_big].cnt = 0;
138 table->char_bigram[ch1].max_char = ch2;
141 table->char_bigram[ch1].bigram[ch2].cnt = cnt;
142 table->char_bigram[ch1].total_cnt += cnt;
143 table->total_cnt += cnt;
147 table->worst_cost =
static_cast<int>(
149 for (
char_32 ch1 = 0; ch1 <= table->max_char; ch1++) {
150 for (
char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) {
151 int cnt = table->char_bigram[ch1].bigram[ch2].cnt;
152 table->char_bigram[ch1].bigram[ch2].cost =
154 log(
MAX(0.5, static_cast<double>(cnt)) /
158 return char_bigrams_obj;
static bool ReadFileToString(const string &file_name, string *str)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)