# NOTE(review): this file is a damaged extraction of a Python source (it
# matches the shape of tesseract tesstrain's language_specific.py,
# set_lang_specific_parameters). The leading integers (896, 897, ...) are the
# original file's line numbers fused into the code text, and statements have
# been split from their string literals (e.g. the f-string prefix `f` is
# separated from its quoted literal on the next line). The text is NOT valid
# Python until those lines are re-joined — confirm against the original file
# before making any behavioral edit. Gaps in the embedded numbering
# (898 -> 904, 911 -> 918, ...) mean lines are missing from this extraction.
#
# Defaults for the per-language training parameters; the large
# `elif lang == ...` chain that follows overrides them per language.
# Default corpus path is derived from the webtext prefix and the lang code.
896 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
897 FILTER_ARGUMENTS = []
898 WORDLIST2DAWG_ARGUMENTS =
""
# Dawg (dictionary) pruning factors; None presumably means "use the tool's
# built-in default" — TODO confirm against wordlist2dawg's handling.
904 PUNC_DAWG_FACTOR =
None
905 NUMBER_DAWG_FACTOR = 0.125
906 WORD_DAWG_FACTOR = 0.05
907 BIGRAM_DAWG_FACTOR = 0.015
908 TRAINING_DATA_ARGUMENTS = []
909 FRAGMENTS_DISABLED =
"y"
910 RUN_SHAPE_CLUSTERING =
False
911 AMBIGS_FILTER_DENOMINATOR =
"100000"
918 TEXT2IMAGE_EXTRA_ARGS = []
# None here appears to mean "not set / use default" for both knobs below.
921 GENERATE_WORD_BIGRAMS =
None
922 WORD_DAWG_SIZE =
None
# NOTE(review): original lines 923-925 are missing from this extraction, so
# the condition guarding this `--ligatures` addition (likely a Latin-script
# check) is not visible — verify before relying on it.
926 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
# Per-language overrides: a long `elif lang == ...` / `elif lang in (...)`
# chain. NOTE(review): jumps in the embedded original line numbers mean many
# branch guards are missing from this extraction — bare assignments below
# with no visible `elif` belong to branches whose conditions were lost.
# (Branch guard missing — original line 927 absent; this and similar early
# branches configure historical-Latin-script languages.)
928 FONTS = EARLY_LATIN_FONTS
# Early-variant languages reuse the modern language's corpus (e.g. fra for
# the old-French variant) and ask the filter for the early variant.
930 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/fra.corpus.txt"
932 FILTER_ARGUMENTS += [
"--make_early_language_variant=fra"]
933 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
935 FONTS = EARLY_LATIN_FONTS
937 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/deu.corpus.txt"
939 FONTS = FRAKTUR_FONTS
# Old Italian: modern Italian corpus + early-variant filtering + ligatures.
940 elif lang ==
"ita_old":
941 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/ita.corpus.txt"
943 FILTER_ARGUMENTS += [
"--make_early_language_variant=ita"]
944 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
946 FONTS = EARLY_LATIN_FONTS
# Wider exposure range for this (unseen) branch's degraded-image training.
949 EXPOSURES =
"-3 -2 -1 0 1 2 3".split()
951 FONTS = NEOLATIN_FONTS
# Old Spanish: same early-variant pattern as ita_old/fra above.
952 elif lang ==
"spa_old":
953 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/spa.corpus.txt"
955 FILTER_ARGUMENTS += [
"--make_early_language_variant=spa"]
956 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
958 FONTS = EARLY_LATIN_FONTS
# Serbian in Latin script shares the Serbian corpus.
959 elif lang ==
"srp_latn":
960 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/srp.corpus.txt"
962 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
964 FONTS = VIETNAMESE_FONTS
# The following scattered assignments (968-1004) belong to branches whose
# guards are missing from this extraction.
968 WORD_DAWG_SIZE = 1_000_000
970 WORD_DAWG_SIZE = 1_000_000
984 PUNC_DAWG_FACTOR = 0.004
990 WORD_DAWG_FACTOR = 0.125
992 WORD_DAWG_FACTOR = 0.03
1004 WORD_DAWG_FACTOR = 0.08
# Irish in uncial script.
1007 elif lang ==
"gle_uncial":
1009 FONTS = IRISH_UNCIAL_FONTS
# IAST (Sanskrit transliteration); branch body mostly missing (1017-1034).
1016 elif lang ==
"iast":
1035 WORD_DAWG_FACTOR = 0.02
# Latin-script language-id training data: bigrams disabled, big word dawg.
1065 elif lang ==
"lat_lid":
1066 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
1067 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1068 GENERATE_WORD_BIGRAMS = 0
1071 WORD_DAWG_SIZE = 1_000_000
1073 FONTS = EARLY_LATIN_FONTS
# Cyrillic-script branches (guards missing for 1078-1095).
1078 FONTS = RUSSIAN_FONTS
1080 NUMBER_DAWG_FACTOR = 0.05
1081 WORD_DAWG_SIZE = 1_000_000
# MIX_LANG set to the language itself — presumably marks a mixed-script
# build; TODO confirm how MIX_LANG is consumed downstream.
1093 MIX_LANG = f
"{lang}"
1095 FONTS = RUSSIAN_FONTS
# Cyrillic-script language-id training data; mirrors lat_lid above.
1100 elif lang ==
"cyr_lid":
1101 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
1102 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1103 GENERATE_WORD_BIGRAMS = 0
1104 WORD_DAWG_SIZE = 1_000_000
1106 FONTS = RUSSIAN_FONTS
# Indic scripts: Bengali-script languages.
1110 elif lang
in (
"asm",
"ben"):
1112 WORD_DAWG_FACTOR = 0.15
1114 FONTS = BENGALI_FONTS
# Devanagari-script languages.
1115 elif lang
in (
"bih",
"hin",
"mar",
"nep",
"san"):
1117 WORD_DAWG_FACTOR = 0.15
1119 FONTS = DEVANAGARI_FONTS
# Guards missing below (1120-1175): per the font constants these configure
# Tibetan, Gujarati, Kannada, Malayalam, Punjabi, Sinhala, Tamil?, Telugu.
1122 WORD_DAWG_FACTOR = 0.15
1124 FONTS = TIBETAN_FONTS
1126 WORD_DAWG_FACTOR = 0.01
1128 FONTS = TIBETAN_FONTS
1131 WORD_DAWG_FACTOR = 0.15
1133 FONTS = GUJARATI_FONTS
1136 WORD_DAWG_FACTOR = 0.15
1137 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1138 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1140 FONTS = KANNADA_FONTS
1143 WORD_DAWG_FACTOR = 0.15
1144 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1145 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1147 FONTS = MALAYALAM_FONTS
1149 WORD_DAWG_FACTOR = 0.01
1154 WORD_DAWG_FACTOR = 0.01
1156 FONTS = PUNJABI_FONTS
1159 WORD_DAWG_FACTOR = 0.01
1161 FONTS = SINHALA_FONTS
1164 WORD_DAWG_FACTOR = 0.15
1165 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1166 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1171 WORD_DAWG_FACTOR = 0.15
1172 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1173 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1175 FONTS = TELUGU_FONTS
# Javanese script.
1178 elif lang ==
"jav_java":
1180 WORD_DAWG_FACTOR = 0.15
1181 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1183 FONTS = JAVANESE_FONTS
# Guards missing (1184-1201): Lao and Burmese per the font constants.
1186 WORD_DAWG_FACTOR = 0.15
1187 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1192 WORD_DAWG_FACTOR = 0.15
1193 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1195 FONTS = LAOTHIAN_FONTS
1198 WORD_DAWG_FACTOR = 0.15
1199 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1201 FONTS = BURMESE_FONTS
# Thai (guard missing): unspaced script — segmenter does word breaking, and
# spaces are stripped from the rendered training text.
1204 WORD_DAWG_FACTOR = 0.01
1205 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1206 FILTER_ARGUMENTS += [
"--segmenter_lang=tha"]
1207 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1208 AMBIGS_FILTER_DENOMINATOR =
"1000"
# CJK: same unspaced-script treatment plus a charset filter.
1214 elif lang ==
"chi_sim":
1216 PUNC_DAWG_FACTOR = 0.015
1217 WORD_DAWG_FACTOR = 0.015
1218 GENERATE_WORD_BIGRAMS = 0
1219 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1220 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1221 FILTER_ARGUMENTS += [
"--charset_filter=chi_sim",
"--segmenter_lang=chi_sim"]
1223 FONTS = CHI_SIM_FONTS
1224 elif lang ==
"chi_tra":
1226 WORD_DAWG_FACTOR = 0.015
1227 GENERATE_WORD_BIGRAMS = 0
1228 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1229 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
# NOTE(review): `--charset_filter=chi_tr` does not match the lang code
# `chi_tra` used for the segmenter in the same list — possibly a typo,
# possibly the filter's own naming; confirm against the charset filter's
# accepted names before changing.
1230 FILTER_ARGUMENTS += [
"--charset_filter=chi_tr",
"--segmenter_lang=chi_tra"]
1232 FONTS = CHI_TRA_FONTS
# Japanese (guard missing, original line ~1233-1234).
1235 WORD_DAWG_FACTOR = 0.015
1236 GENERATE_WORD_BIGRAMS = 0
1237 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1238 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1239 FILTER_ARGUMENTS += [
"--charset_filter=jpn",
"--segmenter_lang=jpn"]
# Korean (guard missing): note it keeps spaces (no --no_space_in_output).
1244 WORD_DAWG_FACTOR = 0.015
1245 NUMBER_DAWG_FACTOR = 0.05
1246 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1247 TRAINING_DATA_ARGUMENTS += [
"--desired_bigrams="]
1248 GENERATE_WORD_BIGRAMS = 0
1249 FILTER_ARGUMENTS += [
"--charset_filter=kor",
"--segmenter_lang=kor"]
1251 FONTS = KOREAN_FONTS
# RTL / Middle-Eastern scripts (most guards missing in this extraction).
1256 FONTS = ARABIC_FONTS
1259 FONTS = THAANA_FONTS
1260 elif lang
in (
"fas",
"pus",
"snd",
"uig",
"urd"):
1262 FONTS = PERSIAN_FONTS
1263 elif lang
in (
"heb",
"yid"):
1264 NUMBER_DAWG_FACTOR = 0.05
1265 WORD_DAWG_FACTOR = 0.08
1267 FONTS = HEBREW_FONTS
1270 FONTS = SYRIAC_FONTS
# Ethiopic script.
1273 elif lang
in (
"amh",
"tir"):
1275 FONTS = AMHARIC_FONTS
# Cherokee (guard missing): aboriginal fonts plus a dedicated Noto face.
1278 FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS,
"Noto Sans Cherokee"]
1280 NUMBER_DAWG_FACTOR = 0.05
1281 WORD_DAWG_FACTOR = 0.08
# Ancient Greek (guard missing): widened exposure range like line 949.
1286 EXPOSURES =
"-3 -2 -1 0 1 2 3".split()
1288 FONTS = ANCIENT_GREEK_FONTS
1291 FONTS = ARMENIAN_FONTS
1294 FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
1297 FONTS = GEORGIAN_FONTS
# Old Georgian reuses the modern Georgian corpus.
1298 elif lang ==
"kat_old":
1299 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/kat.corpus.txt"
1301 FONTS = OLD_GEORGIAN_FONTS
1304 FONTS = KYRGYZ_FONTS
1305 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=100"]
1309 elif lang ==
"kur_ara":
1311 FONTS = KURDISH_FONTS
# Final else of the chain (the `else:` at ~1312 is missing from this
# extraction): unknown language codes fail fast.
1313 raise ValueError(f
"Error: {lang} is not a valid language code")
# Environment override for --mean_count; -1 (the default) disables it.
1315 FLAGS_mean_count =
int(os.environ.get(
"FLAGS_mean_count", -1))
1316 if FLAGS_mean_count > 0:
1317 TRAINING_DATA_ARGUMENTS += [f
"--mean_count={FLAGS_mean_count}"]
# NOTE(review): `elif not MEAN_COUNT:` emits --mean_count={MEAN_COUNT} only
# when MEAN_COUNT is falsy, i.e. the flag would carry an empty/zero value —
# this looks inverted (likely intended `elif MEAN_COUNT:`); confirm against
# the upstream source before changing.
1318 elif not MEAN_COUNT:
1319 TRAINING_DATA_ARGUMENTS += [f
"--mean_count={MEAN_COUNT}"]
# Collect the computed per-language settings keyed by the lower-case
# attribute name each one maps to on `ctx`.
# NOTE(review): this extraction is incomplete — entries at original lines
# 1380 and 1384 (per the numbering gaps) and the closing `}` (~1397) are
# missing; do not treat this dict as the full list.
1375 vars_to_transfer = {
1376 'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
1377 'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
1378 'exposures': EXPOSURES,
1379 'filter_arguments': FILTER_ARGUMENTS,
1381 'fragments_disabled': FRAGMENTS_DISABLED,
1382 'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
1383 'lang_is_rtl': LANG_IS_RTL,
1385 'mean_count': MEAN_COUNT,
1386 'mix_lang': MIX_LANG,
1387 'norm_mode': NORM_MODE,
1388 'number_dawg_factor': NUMBER_DAWG_FACTOR,
1389 'punc_dawg_factor': PUNC_DAWG_FACTOR,
1390 'run_shape_clustering': RUN_SHAPE_CLUSTERING,
1391 'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
1392 'text_corpus': TEXT_CORPUS,
1393 'training_data_arguments': TRAINING_DATA_ARGUMENTS,
1394 'word_dawg_factor': WORD_DAWG_FACTOR,
1395 'word_dawg_size': WORD_DAWG_SIZE,
1396 'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
# Copy each value onto `ctx`, logging when an existing attribute changes.
# NOTE(review): the `else:` arms at original lines 1404 and 1406 are missing
# from this extraction — the two trailing log.debug/setattr fragments below
# belong to those unseen branches ("set on cmdline" presumably fires when
# the ctx value already differs... via a command-line flag; verify upstream).
1399 for attr, value
in vars_to_transfer.items():
1400 if hasattr(ctx, attr):
1401 if getattr(ctx, attr) != value:
1402 log.debug(f
"{attr} = {value} (was {getattr(ctx, attr)})")
1403 setattr(ctx, attr, value)
1405 log.debug(f
"{attr} = {value} (set on cmdline)")
1407 log.debug(f
"{attr} = {value}")
1408 setattr(ctx, attr, value)