25 log = logging.getLogger(__name__)
28 VALID_LANGUAGE_CODES = (
29 "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
30 "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
31 "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
32 "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
33 "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat "
34 "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
35 "pan pol por pus ron rus san sin slk slv snd spa spa_old "
36 "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
37 "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
41 UNUSABLE_LANGUAGE_CODES =
""
44 "CaslonishFraxx Medium",
45 "Cloister Black, Light",
58 "Courier New Bold Italic",
61 "Times New Roman, Bold",
62 "Times New Roman, Bold Italic",
63 "Times New Roman, Italic",
68 "Georgia Bold Italic",
70 "Trebuchet MS Bold Italic",
71 "Trebuchet MS Italic",
76 "Verdana Bold Italic",
78 "URW Bookman L Italic",
79 "URW Bookman L Bold Italic",
80 "Century Schoolbook L Bold",
81 "Century Schoolbook L Italic",
82 "Century Schoolbook L Bold Italic",
83 "Century Schoolbook L Medium",
84 "DejaVu Sans Ultra-Light",
92 "GFS Bodoni Bold Italic",
96 "GFS Didot Bold Italic",
103 "EB Garamond Italic",
107 "Junicode Bold Italic",
108 "IM FELL DW Pica PRO",
109 "IM FELL English PRO",
110 "IM FELL Double Pica PRO",
111 "IM FELL French Canon PRO",
112 "IM FELL Great Primer PRO",
113 "IM FELL DW Pica PRO Italic",
114 "IM FELL English PRO Italic",
115 "IM FELL Double Pica PRO Italic",
116 "IM FELL French Canon PRO Italic",
117 "IM FELL Great Primer PRO Italic",
120 IRISH_UNCIAL_FONTS = [
121 "Bunchlo Arsa Dubh GC",
123 "Bunchlo Arsa GC Bold",
127 "Bunchlo Nua GC Bold",
136 "Seanchló na Nod GC",
137 "Seanchló Ársa Dubh GC",
139 "Seanchló Ársa GC Bold",
146 EARLY_LATIN_FONTS = [
158 "Arial Unicode MS Bold",
163 "Courier New Bold Italic",
168 "Palatino Linotype Bold",
169 "Palatino Linotype Bold Italic",
170 "Palatino Linotype Italic",
172 "Really No 2 LT W2G Light",
173 "Really No 2 LT W2G Light Italic",
174 "Really No 2 LT W2G Medium",
175 "Really No 2 LT W2G Medium Italic",
176 "Really No 2 LT W2G Semi-Bold",
177 "Really No 2 LT W2G Semi-Bold Italic",
178 "Really No 2 LT W2G Ultra-Bold",
179 "Really No 2 LT W2G Ultra-Bold Italic",
180 "Times New Roman, Bold",
181 "Times New Roman, Bold Italic",
182 "Times New Roman, Italic",
187 "Verdana Bold Italic",
200 "Arial Unicode MS Bold",
202 "Noto Sans Devanagari Bold",
203 "Noto Sans Devanagari",
204 "Samyak Devanagari Medium",
211 "Santipur OT Medium",
218 "Kedage Bold Italic",
222 "Mallige Bold Italic",
224 "Arial Unicode MS Bold",
227 "Noto Sans Kannada Bold",
238 "Arial Unicode MS Bold",
249 "Noto Sans Telugu Bold",
255 "Sree Krushnadevaraya",
258 "Tenali Ramakrishna",
272 "Arial Unicode MS Bold",
274 "Droid Sans Tamil Bold",
276 "Karla Tamil Inclined Bold Italic",
277 "Karla Tamil Inclined Italic",
278 "Karla Tamil Upright Bold",
279 "Karla Tamil Upright",
280 "Noto Sans Tamil Bold",
282 "Noto Sans Tamil UI Bold",
283 "Noto Sans Tamil UI",
285 "Lohit Tamil Classical",
293 "Lucida Sans Typewriter",
300 "Lucida Sans Typewriter Bold",
301 "Lucida Sans Semi-Bold",
302 "Garuda Bold Oblique",
303 "Norasi Bold Italic",
304 "Norasi Bold Oblique",
306 "Arial Unicode MS Bold",
310 "Noto Serif Thai Bold",
317 "UtSaHaGumm LT Thai",
323 "Arial Unicode MS Bold",
324 "Baekmuk Batang Patched",
333 "AR PL UMing Patched Light",
335 "Arial Unicode MS Bold",
336 "WenQuanYi Zen Hei Medium",
341 "AR PL UMing TW MBE Light",
342 "AR PL UKai Patched",
343 "AR PL UMing Patched Light",
345 "Arial Unicode MS Bold",
346 "WenQuanYi Zen Hei Medium",
358 "Noto Sans Japanese Bold",
359 "Noto Sans Japanese Light",
368 "Courier New Bold Italic",
369 "Courier New Italic",
371 "Times New Roman, Bold",
372 "Times New Roman, Bold Italic",
373 "Times New Roman, Italic",
378 "Georgia Bold Italic",
380 "Trebuchet MS Bold Italic",
381 "Trebuchet MS Italic",
386 "Verdana Bold Italic",
388 "DejaVu Serif Oblique",
390 "DejaVu Serif Bold Oblique",
393 "FreeSerif Bold Italic",
394 "DejaVu Sans Ultra-Light",
399 "Arial Unicode MS Bold",
401 "DejaVu Sans Mono Oblique",
402 "DejaVu Sans Mono Bold",
403 "DejaVu Sans Mono Bold Oblique",
405 "DejaVu Serif Semi-Condensed",
406 "DejaVu Serif Oblique",
408 "DejaVu Serif Bold Oblique",
409 "DejaVu Serif Bold Semi-Condensed",
411 "FreeSerif Bold Italic",
416 "Linux Biolinum O Bold",
418 "Linux Libertine O Bold",
420 "Linux Libertine O Bold Italic",
421 "Linux Libertine O Italic",
422 "Palatino Linotype Bold",
423 "Palatino Linotype Bold Italic",
424 "Palatino Linotype Italic",
430 ANCIENT_GREEK_FONTS = [
432 "GFS Artemisia Bold",
433 "GFS Artemisia Bold Italic",
434 "GFS Artemisia Italic",
437 "GFS Bodoni Bold Italic",
441 "GFS Didot Bold Italic",
445 "GFS Neohellenic Bold",
446 "GFS Neohellenic Bold Italic",
447 "GFS Neohellenic Italic",
455 "Arabic Transparent Bold",
456 "Arabic Transparent",
458 "Arial Unicode MS Bold",
467 "Frutiger LT Arabic Bold",
468 "Frutiger LT Arabic",
478 "Palatino LT Arabic",
479 "Palatino Sans Arabic Bold",
480 "Palatino Sans Arabic",
481 "Simplified Arabic Bold",
483 "Times New Roman, Bold",
485 "Traditional Arabic Bold",
486 "Traditional Arabic",
495 "Courier New Bold Italic",
496 "Courier New Italic",
498 "Ergo Hebrew Semi-Bold",
499 "Ergo Hebrew Semi-Bold Italic",
501 "Ergo Hebrew Italic",
502 "Really No 2 LT W2G Light",
503 "Really No 2 LT W2G Light Italic",
504 "Really No 2 LT W2G Medium",
505 "Really No 2 LT W2G Medium Italic",
506 "Really No 2 LT W2G Semi-Bold",
507 "Really No 2 LT W2G Semi-Bold Italic",
508 "Really No 2 LT W2G Ultra-Bold",
509 "Really No 2 LT W2G Ultra-Bold Italic",
510 "Times New Roman, Bold",
511 "Times New Roman, Bold Italic",
512 "Times New Roman, Italic",
523 "Jamrul Medium Semi-Expanded",
525 "Arial Unicode MS Bold",
531 "Noto Sans Bengali Bold",
546 "Courier New Italic",
547 "Courier New Bold Italic",
549 "Times New Roman, Bold",
550 "Times New Roman, Bold Italic",
551 "Times New Roman, Italic",
553 "DejaVu Serif Oblique",
555 "DejaVu Serif Bold Oblique",
558 "FreeSerif Bold Italic",
566 "Andale Sans Arabic Farsi",
568 "Arial Unicode MS Bold",
571 "Lucida Sans Oblique",
572 "Lucida Sans Semi-Bold",
574 "Lucida Sans Typewriter Bold",
575 "Lucida Sans Typewriter Oblique",
576 "Lucida Sans Typewriter",
580 "Times New Roman, Bold",
581 "Times New Roman, Bold Italic",
582 "Times New Roman, Italic",
583 "Yakout Linotype Bold",
589 "Droid Sans Ethiopic Bold",
590 "Droid Sans Ethiopic",
592 "Noto Sans Ethiopic Bold",
593 "Noto Sans Ethiopic",
598 "Arial Unicode MS Bold",
609 "Noto Sans Myanmar Bold",
616 JAVANESE_FONTS = [
"Prada"]
618 NORTH_AMERICAN_ABORIGINAL_FONTS = [
620 "Aboriginal Sans Bold Italic",
621 "Aboriginal Sans Italic",
622 "Aboriginal Sans Bold",
623 "Aboriginal Serif Bold",
624 "Aboriginal Serif Bold Italic",
625 "Aboriginal Serif Italic",
630 "Arial Unicode MS Bold",
632 "BPG Algeti GPL\&GNU",
633 "BPG Chveulebrivi GPL\&GNU",
634 "BPG Courier GPL\&GNU",
635 "BPG Courier S GPL\&GNU",
636 "BPG DejaVu Sans 2011 GNU-GPL",
637 "BPG Elite GPL\&GNU",
638 "BPG Excelsior GPL\&GNU",
639 "BPG Glaho GPL\&GNU",
640 "BPG Gorda GPL\&GNU",
641 "BPG Ingiri GPL\&GNU",
642 "BPG Mrgvlovani Caps GNU\&GPL",
643 "BPG Mrgvlovani GPL\&GNU",
644 "BPG Nateli Caps GPL\&GNU Light",
645 "BPG Nateli Condenced GPL\&GNU Light",
646 "BPG Nateli GPL\&GNU Light",
647 "BPG Nino Medium Cond GPL\&GNU",
648 "BPG Nino Medium GPL\&GNU Medium",
650 "BPG Sans Medium GPL\&GNU",
651 "BPG Sans Modern GPL\&GNU",
652 "BPG Sans Regular GPL\&GNU",
653 "BPG Serif GPL\&GNU",
654 "BPG Serif Modern GPL\&GNU",
656 "FreeMono Bold Italic",
660 "FreeSerif Bold Italic",
664 OLD_GEORGIAN_FONTS = [
665 "Arial Unicode MS Bold",
667 "BPG Algeti GPL\&GNU",
668 "BPG Courier S GPL\&GNU",
669 "BPG DejaVu Sans 2011 GNU-GPL",
670 "BPG Elite GPL\&GNU",
671 "BPG Excelsior GPL\&GNU",
672 "BPG Glaho GPL\&GNU",
673 "BPG Ingiri GPL\&GNU",
674 "BPG Mrgvlovani Caps GNU\&GPL",
675 "BPG Mrgvlovani GPL\&GNU",
676 "BPG Nateli Caps GPL\&GNU Light",
677 "BPG Nateli Condenced GPL\&GNU Light",
678 "BPG Nateli GPL\&GNU Light",
679 "BPG Nino Medium Cond GPL\&GNU",
680 "BPG Nino Medium GPL\&GNU Medium",
682 "BPG Sans Medium GPL\&GNU",
683 "BPG Sans Modern GPL\&GNU",
684 "BPG Sans Regular GPL\&GNU",
685 "BPG Serif GPL\&GNU",
686 "BPG Serif Modern GPL\&GNU",
690 "FreeSerif Bold Italic",
697 "Khmer OS Battambang",
702 "Khmer OS Metal Chrieng",
703 "Khmer OS Muol Light",
704 "Khmer OS Muol Pali",
709 "Noto Serif Khmer Bold",
710 "Noto Serif Khmer Light",
719 "Arial Unicode MS Bold",
722 "Lucida Sans Oblique",
723 "Lucida Sans Semi-Bold",
725 "Lucida Sans Typewriter Bold",
726 "Lucida Sans Typewriter Oblique",
727 "Lucida Sans Typewriter",
731 "Times New Roman, Bold",
732 "Times New Roman, Bold Italic",
733 "Times New Roman, Italic",
735 "Yakout Linotype Bold",
742 "Arial Unicode MS Bold",
749 "Noto Sans Lao Bold",
751 "Noto Sans Lao UI Bold",
753 "Noto Serif Lao Bold",
763 "Samyak Gujarati Medium",
768 "Arial Unicode MS Bold",
771 "Noto Sans Gujarati Bold",
772 "Noto Sans Gujarati",
780 "Arial Unicode MS Bold",
789 "Noto Sans Malayalam Bold",
790 "Noto Sans Malayalam",
799 "Arial Unicode MS Bold",
802 "Samyak Oriya Medium",
808 "Arial Unicode MS Bold",
812 "Noto Sans Gurmukhi",
813 "Noto Sans Gurmukhi Bold",
820 "Noto Sans Sinhala Bold",
829 "East Syriac Adiabene",
830 "East Syriac Ctesiphon",
831 "Estrangelo Antioch",
834 "Estrangelo Nisibin",
835 "Estrangelo Quenneshrin",
837 "Estrangelo TurAbdin",
840 "Serto Jerusalem Bold",
841 "Serto Jerusalem Italic",
852 THAANA_FONTS = [
"FreeSerif"]
856 "Arial Unicode MS Bold",
862 "Tibetan Machine Uni",
871 "AR PL UKai Patched",
872 "AR PL UMing Patched Light",
873 "Baekmuk Batang Patched",
# Prefix used when building the TEXT_CORPUS paths below
# ("{FLAGS_webtext_prefix}/<lang>.corpus.txt").  Read from the
# FLAGS_webtext_prefix environment variable; defaults to the empty string
# when the variable is not set.
FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
896 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
897 FILTER_ARGUMENTS = []
898 WORDLIST2DAWG_ARGUMENTS =
""
904 PUNC_DAWG_FACTOR =
None
905 NUMBER_DAWG_FACTOR = 0.125
906 WORD_DAWG_FACTOR = 0.05
907 BIGRAM_DAWG_FACTOR = 0.015
908 TRAINING_DATA_ARGUMENTS = []
909 FRAGMENTS_DISABLED =
"y"
910 RUN_SHAPE_CLUSTERING =
False
911 AMBIGS_FILTER_DENOMINATOR =
"100000"
918 TEXT2IMAGE_EXTRA_ARGS = []
921 GENERATE_WORD_BIGRAMS =
None
922 WORD_DAWG_SIZE =
None
926 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
928 FONTS = EARLY_LATIN_FONTS
930 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/fra.corpus.txt"
932 FILTER_ARGUMENTS += [
"--make_early_language_variant=fra"]
933 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
935 FONTS = EARLY_LATIN_FONTS
937 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/deu.corpus.txt"
939 FONTS = FRAKTUR_FONTS
940 elif lang ==
"ita_old":
941 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/ita.corpus.txt"
943 FILTER_ARGUMENTS += [
"--make_early_language_variant=ita"]
944 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
946 FONTS = EARLY_LATIN_FONTS
949 EXPOSURES =
"-3 -2 -1 0 1 2 3".split()
951 FONTS = NEOLATIN_FONTS
952 elif lang ==
"spa_old":
953 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/spa.corpus.txt"
955 FILTER_ARGUMENTS += [
"--make_early_language_variant=spa"]
956 TEXT2IMAGE_EXTRA_ARGS += [
"--ligatures"]
958 FONTS = EARLY_LATIN_FONTS
959 elif lang ==
"srp_latn":
960 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/srp.corpus.txt"
962 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
964 FONTS = VIETNAMESE_FONTS
968 WORD_DAWG_SIZE = 1_000_000
970 WORD_DAWG_SIZE = 1_000_000
984 PUNC_DAWG_FACTOR = 0.004
990 WORD_DAWG_FACTOR = 0.125
992 WORD_DAWG_FACTOR = 0.03
1004 WORD_DAWG_FACTOR = 0.08
1007 elif lang ==
"gle_uncial":
1009 FONTS = IRISH_UNCIAL_FONTS
1016 elif lang ==
"iast":
1035 WORD_DAWG_FACTOR = 0.02
1065 elif lang ==
"lat_lid":
1066 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
1067 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1068 GENERATE_WORD_BIGRAMS = 0
1071 WORD_DAWG_SIZE = 1_000_000
1073 FONTS = EARLY_LATIN_FONTS
1078 FONTS = RUSSIAN_FONTS
1080 NUMBER_DAWG_FACTOR = 0.05
1081 WORD_DAWG_SIZE = 1_000_000
1093 MIX_LANG = f
"{lang}"
1095 FONTS = RUSSIAN_FONTS
1100 elif lang ==
"cyr_lid":
1101 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
1102 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1103 GENERATE_WORD_BIGRAMS = 0
1104 WORD_DAWG_SIZE = 1_000_000
1106 FONTS = RUSSIAN_FONTS
1110 elif lang
in (
"asm",
"ben"):
1112 WORD_DAWG_FACTOR = 0.15
1114 FONTS = BENGALI_FONTS
1115 elif lang
in (
"bih",
"hin",
"mar",
"nep",
"san"):
1117 WORD_DAWG_FACTOR = 0.15
1119 FONTS = DEVANAGARI_FONTS
1122 WORD_DAWG_FACTOR = 0.15
1124 FONTS = TIBETAN_FONTS
1126 WORD_DAWG_FACTOR = 0.01
1128 FONTS = TIBETAN_FONTS
1131 WORD_DAWG_FACTOR = 0.15
1133 FONTS = GUJARATI_FONTS
1136 WORD_DAWG_FACTOR = 0.15
1137 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1138 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1140 FONTS = KANNADA_FONTS
1143 WORD_DAWG_FACTOR = 0.15
1144 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1145 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1147 FONTS = MALAYALAM_FONTS
1149 WORD_DAWG_FACTOR = 0.01
1154 WORD_DAWG_FACTOR = 0.01
1156 FONTS = PUNJABI_FONTS
1159 WORD_DAWG_FACTOR = 0.01
1161 FONTS = SINHALA_FONTS
1164 WORD_DAWG_FACTOR = 0.15
1165 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1166 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1171 WORD_DAWG_FACTOR = 0.15
1172 TRAINING_DATA_ARGUMENTS += [
"--no_newline_in_output"]
1173 TEXT2IMAGE_EXTRA_ARGS += [
"--char_spacing=0.5"]
1175 FONTS = TELUGU_FONTS
1178 elif lang ==
"jav_java":
1180 WORD_DAWG_FACTOR = 0.15
1181 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1183 FONTS = JAVANESE_FONTS
1186 WORD_DAWG_FACTOR = 0.15
1187 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1192 WORD_DAWG_FACTOR = 0.15
1193 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1195 FONTS = LAOTHIAN_FONTS
1198 WORD_DAWG_FACTOR = 0.15
1199 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1201 FONTS = BURMESE_FONTS
1204 WORD_DAWG_FACTOR = 0.01
1205 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1206 FILTER_ARGUMENTS += [
"--segmenter_lang=tha"]
1207 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1208 AMBIGS_FILTER_DENOMINATOR =
"1000"
1214 elif lang ==
"chi_sim":
1216 PUNC_DAWG_FACTOR = 0.015
1217 WORD_DAWG_FACTOR = 0.015
1218 GENERATE_WORD_BIGRAMS = 0
1219 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1220 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1221 FILTER_ARGUMENTS += [
"--charset_filter=chi_sim",
"--segmenter_lang=chi_sim"]
1223 FONTS = CHI_SIM_FONTS
1224 elif lang ==
"chi_tra":
1226 WORD_DAWG_FACTOR = 0.015
1227 GENERATE_WORD_BIGRAMS = 0
1228 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1229 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1230 FILTER_ARGUMENTS += [
"--charset_filter=chi_tr",
"--segmenter_lang=chi_tra"]
1232 FONTS = CHI_TRA_FONTS
1235 WORD_DAWG_FACTOR = 0.015
1236 GENERATE_WORD_BIGRAMS = 0
1237 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1238 TRAINING_DATA_ARGUMENTS += [
"--no_space_in_output",
"--desired_bigrams="]
1239 FILTER_ARGUMENTS += [
"--charset_filter=jpn",
"--segmenter_lang=jpn"]
1244 WORD_DAWG_FACTOR = 0.015
1245 NUMBER_DAWG_FACTOR = 0.05
1246 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=10000"]
1247 TRAINING_DATA_ARGUMENTS += [
"--desired_bigrams="]
1248 GENERATE_WORD_BIGRAMS = 0
1249 FILTER_ARGUMENTS += [
"--charset_filter=kor",
"--segmenter_lang=kor"]
1251 FONTS = KOREAN_FONTS
1256 FONTS = ARABIC_FONTS
1259 FONTS = THAANA_FONTS
1260 elif lang
in (
"fas",
"pus",
"snd",
"uig",
"urd"):
1262 FONTS = PERSIAN_FONTS
1263 elif lang
in (
"heb",
"yid"):
1264 NUMBER_DAWG_FACTOR = 0.05
1265 WORD_DAWG_FACTOR = 0.08
1267 FONTS = HEBREW_FONTS
1270 FONTS = SYRIAC_FONTS
1273 elif lang
in (
"amh",
"tir"):
1275 FONTS = AMHARIC_FONTS
1278 FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS,
"Noto Sans Cherokee"]
1280 NUMBER_DAWG_FACTOR = 0.05
1281 WORD_DAWG_FACTOR = 0.08
1286 EXPOSURES =
"-3 -2 -1 0 1 2 3".split()
1288 FONTS = ANCIENT_GREEK_FONTS
1291 FONTS = ARMENIAN_FONTS
1294 FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
1297 FONTS = GEORGIAN_FONTS
1298 elif lang ==
"kat_old":
1299 TEXT_CORPUS = f
"{FLAGS_webtext_prefix}/kat.corpus.txt"
1301 FONTS = OLD_GEORGIAN_FONTS
1304 FONTS = KYRGYZ_FONTS
1305 TRAINING_DATA_ARGUMENTS += [
"--infrequent_ratio=100"]
1309 elif lang ==
"kur_ara":
1311 FONTS = KURDISH_FONTS
1313 raise ValueError(f
"Error: {lang} is not a valid language code")
# Choose the --mean_count training argument.
#
# A positive FLAGS_mean_count environment variable is an explicit override
# and always wins.  Otherwise fall back to MEAN_COUNT, but only when it
# holds a usable (truthy) value.
#
# The previous test here was `elif not MEAN_COUNT`, which was inverted: it
# skipped every real value and could only ever append a meaningless
# "--mean_count=None" / "--mean_count=0" style argument.
FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
if FLAGS_mean_count > 0:
    TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
elif MEAN_COUNT:
    TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]
1375 vars_to_transfer = {
1376 'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
1377 'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
1378 'exposures': EXPOSURES,
1379 'filter_arguments': FILTER_ARGUMENTS,
1381 'fragments_disabled': FRAGMENTS_DISABLED,
1382 'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
1383 'lang_is_rtl': LANG_IS_RTL,
1385 'mean_count': MEAN_COUNT,
1386 'mix_lang': MIX_LANG,
1387 'norm_mode': NORM_MODE,
1388 'number_dawg_factor': NUMBER_DAWG_FACTOR,
1389 'punc_dawg_factor': PUNC_DAWG_FACTOR,
1390 'run_shape_clustering': RUN_SHAPE_CLUSTERING,
1391 'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
1392 'text_corpus': TEXT_CORPUS,
1393 'training_data_arguments': TRAINING_DATA_ARGUMENTS,
1394 'word_dawg_factor': WORD_DAWG_FACTOR,
1395 'word_dawg_size': WORD_DAWG_SIZE,
1396 'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
1399 for attr, value
in vars_to_transfer.items():
1400 if hasattr(ctx, attr):
1401 if getattr(ctx, attr) != value:
1402 log.debug(f
"{attr} = {value} (was {getattr(ctx, attr)})")
1403 setattr(ctx, attr, value)
1405 log.debug(f
"{attr} = {value} (set on cmdline)")
1407 log.debug(f
"{attr} = {value}")
1408 setattr(ctx, attr, value)