tesseract  5.0.0-alpha-619-ge9db
language_specific Namespace Reference

Functions

def set_lang_specific_parameters (ctx, lang)
 

Variables

 log = logging.getLogger(__name__)
 
tuple VALID_LANGUAGE_CODES
 
string UNUSABLE_LANGUAGE_CODES = ""
 
list FRAKTUR_FONTS
 
list LATIN_FONTS
 
list NEOLATIN_FONTS
 
list IRISH_UNCIAL_FONTS
 
list EARLY_LATIN_FONTS
 
list VIETNAMESE_FONTS
 
list DEVANAGARI_FONTS
 
list KANNADA_FONTS
 
list TELUGU_FONTS
 
list TAMIL_FONTS
 
list THAI_FONTS
 
list KOREAN_FONTS
 
list CHI_SIM_FONTS
 
list CHI_TRA_FONTS
 
list JPN_FONTS
 
list RUSSIAN_FONTS
 
list GREEK_FONTS
 
list ANCIENT_GREEK_FONTS
 
list ARABIC_FONTS
 
list HEBREW_FONTS
 
list BENGALI_FONTS
 
list KYRGYZ_FONTS
 
list PERSIAN_FONTS
 
list AMHARIC_FONTS
 
list ARMENIAN_FONTS
 
list BURMESE_FONTS
 
list JAVANESE_FONTS = ["Prada"]
 
list NORTH_AMERICAN_ABORIGINAL_FONTS
 
list GEORGIAN_FONTS
 
list OLD_GEORGIAN_FONTS
 
list KHMER_FONTS
 
list KURDISH_FONTS
 
list LAOTHIAN_FONTS
 
list GUJARATI_FONTS
 
list MALAYALAM_FONTS
 
list ORIYA_FONTS
 
list PUNJABI_FONTS
 
list SINHALA_FONTS
 
list SYRIAC_FONTS
 
list THAANA_FONTS = ["FreeSerif"]
 
list TIBETAN_FONTS
 
list VERTICAL_FONTS
 
 FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
 

Function Documentation

◆ set_lang_specific_parameters()

def language_specific.set_lang_specific_parameters (   ctx,
  lang 
)

Definition at line 894 of file language_specific.py.

def set_lang_specific_parameters(ctx, lang):
    """Compute the language-dependent training parameters and store them on ctx.

    A set of defaults is established first, then overridden according to
    ``lang``.  The resulting values are copied onto ``ctx`` as lower-case
    attributes (``fonts``, ``text_corpus``, ``mean_count``, ...); see the
    ``vars_to_transfer`` table at the end of the function for the full list.

    Args:
        ctx: Training context object.  ``ctx.fonts`` is read: a non-empty
            value is kept as is, otherwise a font list is selected for the
            language (falling back to ``LATIN_FONTS``).
        lang: Language code, e.g. ``"eng"`` or ``"chi_sim"``.

    Returns:
        The same ``ctx`` object, with the attributes listed above set.

    Raises:
        ValueError: If ``lang`` is not one of the codes handled below.

    The environment variable ``FLAGS_mean_count`` is read on every call; a
    positive value replaces the per-language mean count in the
    ``--mean_count=`` training argument.
    """
    # The default text location is now given directly from the language code.
    TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
    FILTER_ARGUMENTS = []
    WORDLIST2DAWG_ARGUMENTS = ""
    # These dawg factors represent the fraction of the corpus not covered by
    # the dawg, and seem like reasonable defaults, but the optimal value is
    # likely to be highly corpus-dependent, as well as somewhat
    # language-dependent.
    # Number dawg factor is the fraction of all numeric strings that are not
    # covered, which is why it is higher relative to the others.
    PUNC_DAWG_FACTOR = None
    NUMBER_DAWG_FACTOR = 0.125
    WORD_DAWG_FACTOR = 0.05
    BIGRAM_DAWG_FACTOR = 0.015
    TRAINING_DATA_ARGUMENTS = []
    FRAGMENTS_DISABLED = "y"
    RUN_SHAPE_CLUSTERING = False
    AMBIGS_FILTER_DENOMINATOR = "100000"
    LEADING = 32
    MEAN_COUNT = 40  # Default for latin script.
    # Language to mix with the language for maximum accuracy. Defaults to eng.
    # If no language is good, set to the base language.
    MIX_LANG = "eng"
    FONTS = ctx.fonts
    TEXT2IMAGE_EXTRA_ARGS = []
    EXPOSURES = []

    GENERATE_WORD_BIGRAMS = None
    WORD_DAWG_SIZE = None

    # Latin languages.
    if lang == "enm":
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frm":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
        # Make long-s substitutions for Middle French text
        FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "frk":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
        if not FONTS:
            FONTS = FRAKTUR_FONTS
    elif lang == "ita_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
        # Make long-s substitutions for Early Italian text
        FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "lat":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = NEOLATIN_FONTS
    elif lang == "spa_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
        # Make long-s substitutions for Early Spanish text
        FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS
    elif lang == "srp_latn":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
    elif lang == "vie":
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = VIETNAMESE_FONTS
    # Highly inflective languages get a bigger dawg size.
    # TODO(rays) Add more here!
    elif lang in ("hun", "pol"):
        WORD_DAWG_SIZE = 1_000_000

    # Latin languages that only tweak a dawg factor or the font list.
    elif lang == "ces":
        PUNC_DAWG_FACTOR = 0.004
    elif lang == "deu":
        WORD_DAWG_FACTOR = 0.125
    elif lang == "eng":
        WORD_DAWG_FACTOR = 0.03
    elif lang == "fra":
        WORD_DAWG_FACTOR = 0.08
    elif lang == "nld":
        WORD_DAWG_FACTOR = 0.02
    elif lang == "gle_uncial":
        if not FONTS:
            FONTS = IRISH_UNCIAL_FONTS

    # Latin with default treatment: accepted as valid, nothing overridden.
    elif lang in (
        "afr",
        "aze",
        "bos",
        "cat",
        "ceb",
        "cym",
        "dan",
        "epo",
        "est",
        "eus",
        "fil",
        "fin",
        "gle",
        "glg",
        "hat",
        "hrv",
        "iast",
        "ind",
        "isl",
        "ita",
        "jav",
        "lav",
        "lit",
        "mlt",
        "msa",
        "nor",
        "por",
        "ron",
        "slk",
        "slv",
        "spa",
        "sqi",
        "swa",
        "swe",
        "tgl",
        "tur",
        "uzb",
        "zlm",
    ):
        pass

    # Special code for performing language-id that is trained on
    # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
    elif lang == "lat_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        # Strip unrenderable words as not all fonts will render the extended
        # latin symbols found in Vietnamese text.
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = EARLY_LATIN_FONTS

    # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
    elif lang == "rus":
        if not FONTS:
            FONTS = RUSSIAN_FONTS
        MIX_LANG = "rus"
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_SIZE = 1_000_000
    elif lang in (
        "aze_cyrl",
        "bel",
        "bul",
        "kaz",
        "mkd",
        "srp",
        "tgk",
        "ukr",
        "uzb_cyrl",
    ):
        MIX_LANG = f"{lang}"
        if not FONTS:
            FONTS = RUSSIAN_FONTS

    # Special code for performing Cyrillic language-id that is trained on
    # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
    # text with the list of Russian fonts.
    elif lang == "cyr_lid":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        GENERATE_WORD_BIGRAMS = 0
        WORD_DAWG_SIZE = 1_000_000
        if not FONTS:
            FONTS = RUSSIAN_FONTS

    # South Asian scripts mostly have a lot of different graphemes, so trim
    # down the MEAN_COUNT so as not to get a huge amount of text.
    elif lang in ("asm", "ben"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = BENGALI_FONTS
    elif lang in ("bih", "hin", "mar", "nep", "san"):
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = DEVANAGARI_FONTS
    elif lang == "bod":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "dzo":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = TIBETAN_FONTS
    elif lang == "guj":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        if not FONTS:
            FONTS = GUJARATI_FONTS
    elif lang == "kan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = KANNADA_FONTS
    elif lang == "mal":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = MALAYALAM_FONTS
    elif lang == "ori":
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = ORIYA_FONTS
    elif lang == "pan":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = PUNJABI_FONTS
    elif lang == "sin":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.01
        if not FONTS:
            FONTS = SINHALA_FONTS
    elif lang == "tam":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TAMIL_FONTS
    elif lang == "tel":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
        if not FONTS:
            FONTS = TELUGU_FONTS

    # SouthEast Asian scripts.
    elif lang == "jav_java":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = JAVANESE_FONTS
    elif lang == "khm":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = KHMER_FONTS
    elif lang == "lao":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = LAOTHIAN_FONTS
    elif lang == "mya":
        MEAN_COUNT = 12
        WORD_DAWG_FACTOR = 0.15
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        if not FONTS:
            FONTS = BURMESE_FONTS
    elif lang == "tha":
        MEAN_COUNT = 30
        WORD_DAWG_FACTOR = 0.01
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        AMBIGS_FILTER_DENOMINATOR = "1000"
        LEADING = 48
        if not FONTS:
            FONTS = THAI_FONTS

    # CJK
    elif lang == "chi_sim":
        MEAN_COUNT = 15
        PUNC_DAWG_FACTOR = 0.015
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
        if not FONTS:
            FONTS = CHI_SIM_FONTS
    elif lang == "chi_tra":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        # NOTE(review): the charset filter is spelled "chi_tr" while the
        # segmenter uses "chi_tra"; kept as found - confirm which name the
        # filter tool expects.
        FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
        if not FONTS:
            FONTS = CHI_TRA_FONTS
    elif lang == "jpn":
        MEAN_COUNT = 15
        WORD_DAWG_FACTOR = 0.015
        GENERATE_WORD_BIGRAMS = 0
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
        FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
        if not FONTS:
            FONTS = JPN_FONTS
    elif lang == "kor":
        MEAN_COUNT = 20
        WORD_DAWG_FACTOR = 0.015
        NUMBER_DAWG_FACTOR = 0.05
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
        TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
        GENERATE_WORD_BIGRAMS = 0
        FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
        if not FONTS:
            FONTS = KOREAN_FONTS

    # Middle-Eastern scripts.
    elif lang == "ara":
        if not FONTS:
            FONTS = ARABIC_FONTS
    elif lang == "div":
        if not FONTS:
            FONTS = THAANA_FONTS
    elif lang in ("fas", "pus", "snd", "uig", "urd"):
        if not FONTS:
            FONTS = PERSIAN_FONTS
    elif lang in ("heb", "yid"):
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = HEBREW_FONTS
    elif lang == "syr":
        if not FONTS:
            FONTS = SYRIAC_FONTS

    # Other scripts.
    elif lang in ("amh", "tir"):
        if not FONTS:
            FONTS = AMHARIC_FONTS
    elif lang == "chr":
        if not FONTS:
            FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
    elif lang == "ell":
        NUMBER_DAWG_FACTOR = 0.05
        WORD_DAWG_FACTOR = 0.08
        if not FONTS:
            FONTS = GREEK_FONTS
    elif lang == "grc":
        if not EXPOSURES:
            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
        if not FONTS:
            FONTS = ANCIENT_GREEK_FONTS
    elif lang == "hye":
        if not FONTS:
            FONTS = ARMENIAN_FONTS
    elif lang == "iku":
        if not FONTS:
            FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
    elif lang == "kat":
        if not FONTS:
            FONTS = GEORGIAN_FONTS
    elif lang == "kat_old":
        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
        if not FONTS:
            FONTS = OLD_GEORGIAN_FONTS
    elif lang == "kir":
        if not FONTS:
            FONTS = KYRGYZ_FONTS
        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
    elif lang == "kmr":
        if not FONTS:
            FONTS = LATIN_FONTS
    elif lang == "kur_ara":
        if not FONTS:
            FONTS = KURDISH_FONTS
    else:
        raise ValueError(f"Error: {lang} is not a valid language code")

    # Use the mean count given in the environment, or else the per-language
    # default.  (The default branch used to be guarded by "not MEAN_COUNT",
    # which is never true for the non-zero values assigned above, so the
    # per-language value was silently dropped.)
    FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
    if FLAGS_mean_count > 0:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
    elif MEAN_COUNT:
        TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]

    # Default to Latin fonts if none have been set
    if not FONTS:
        FONTS = LATIN_FONTS

    # Default to 0 exposure if it hasn't been set
    if not EXPOSURES:
        EXPOSURES = [0]

    # Set right-to-left and normalization mode.
    if lang in (
        "ara",
        "div",
        "fas",
        "pus",
        "snd",
        "syr",
        "uig",
        "urd",
        "kur_ara",
        "heb",
        "yid",
    ):
        LANG_IS_RTL = True
        NORM_MODE = 2
    elif lang in (
        "asm",
        "ben",
        "bih",
        "hin",
        "mar",
        "nep",
        "guj",
        "kan",
        "mal",
        "tam",
        "tel",
        "pan",
        "dzo",
        "sin",
        "san",
        "bod",
        "ori",
        "khm",
        "mya",
        "tha",
        "lao",
        "jav",  # was "jav " (trailing space), which could never match
        "jav_java",
    ):
        LANG_IS_RTL = False
        NORM_MODE = 2
    else:
        LANG_IS_RTL = False
        NORM_MODE = 1

    vars_to_transfer = {
        'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
        'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
        'exposures': EXPOSURES,
        'filter_arguments': FILTER_ARGUMENTS,
        'fonts': FONTS,
        'fragments_disabled': FRAGMENTS_DISABLED,
        'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
        'lang_is_rtl': LANG_IS_RTL,
        'leading': LEADING,
        'mean_count': MEAN_COUNT,
        'mix_lang': MIX_LANG,
        'norm_mode': NORM_MODE,
        'number_dawg_factor': NUMBER_DAWG_FACTOR,
        'punc_dawg_factor': PUNC_DAWG_FACTOR,
        'run_shape_clustering': RUN_SHAPE_CLUSTERING,
        'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
        'text_corpus': TEXT_CORPUS,
        'training_data_arguments': TRAINING_DATA_ARGUMENTS,
        'word_dawg_factor': WORD_DAWG_FACTOR,
        'word_dawg_size': WORD_DAWG_SIZE,
        'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
    }

    # Copy every computed value onto ctx, logging what changed.
    for attr, value in vars_to_transfer.items():
        if hasattr(ctx, attr):
            previous = getattr(ctx, attr)
            if previous != value:
                log.debug("%s = %s (was %s)", attr, value, previous)
                setattr(ctx, attr, value)
            else:
                log.debug("%s = %s (set on cmdline)", attr, value)
        else:
            log.debug("%s = %s", attr, value)
            setattr(ctx, attr, value)

    return ctx
1411 
1412 # =============================================================================
1413 # END of Language specific info
1414 # =============================================================================

Variable Documentation

◆ AMHARIC_FONTS

list language_specific.AMHARIC_FONTS
Initial value:
1 = [
2  "Abyssinica SIL",
3  "Droid Sans Ethiopic Bold",
4  "Droid Sans Ethiopic",
5  "FreeSerif",
6  "Noto Sans Ethiopic Bold",
7  "Noto Sans Ethiopic",
8 ]

Definition at line 587 of file language_specific.py.

◆ ANCIENT_GREEK_FONTS

list language_specific.ANCIENT_GREEK_FONTS
Initial value:
1 = [
2  "GFS Artemisia",
3  "GFS Artemisia Bold",
4  "GFS Artemisia Bold Italic",
5  "GFS Artemisia Italic",
6  "GFS Bodoni",
7  "GFS Bodoni Bold",
8  "GFS Bodoni Bold Italic",
9  "GFS Bodoni Italic",
10  "GFS Didot",
11  "GFS Didot Bold",
12  "GFS Didot Bold Italic",
13  "GFS Didot Italic",
14  "GFS DidotClassic",
15  "GFS Neohellenic",
16  "GFS Neohellenic Bold",
17  "GFS Neohellenic Bold Italic",
18  "GFS Neohellenic Italic",
19  "GFS Philostratos",
20  "GFS Porson",
21  "GFS Pyrsos",
22  "GFS Solomos",
23 ]

Definition at line 430 of file language_specific.py.

◆ ARABIC_FONTS

list language_specific.ARABIC_FONTS

Definition at line 454 of file language_specific.py.

◆ ARMENIAN_FONTS

list language_specific.ARMENIAN_FONTS
Initial value:
1 = [
2  "Arial Unicode MS",
3  "Arial Unicode MS Bold",
4  "Ascender Uni",
5  "FreeMono",
6  "FreeMono Italic",
7  "FreeSans",
8  "FreeSans Bold",
9  "FreeSans Oblique",
10 ]

Definition at line 596 of file language_specific.py.

◆ BENGALI_FONTS

list language_specific.BENGALI_FONTS
Initial value:
1 = [
2  "Bangla Medium",
3  "Lohit Bengali",
4  "Mukti Narrow",
5  "Mukti Narrow Bold",
6  "Jamrul Medium Semi-Expanded",
7  "Likhan Medium",
8  "Arial Unicode MS Bold",
9  "Ascender Uni",
10  "FreeSans",
11  "FreeSans Oblique",
12  "FreeSerif",
13  "FreeSerif Italic",
14  "Noto Sans Bengali Bold",
15  "Noto Sans Bengali",
16  "Ani",
17  "Lohit Assamese",
18  "Lohit Bengali",
19  "Mitra Mono",
20 ]

Definition at line 518 of file language_specific.py.

◆ BURMESE_FONTS

list language_specific.BURMESE_FONTS
Initial value:
1 = [
2  "Myanmar Sans Pro",
3  "Noto Sans Myanmar Bold",
4  "Noto Sans Myanmar",
5  "Padauk Bold",
6  "Padauk",
7  "TharLon",
8 ]

Definition at line 607 of file language_specific.py.

◆ CHI_SIM_FONTS

list language_specific.CHI_SIM_FONTS
Initial value:
1 = [
2  "AR PL UKai CN",
3  "AR PL UMing Patched Light",
4  "Arial Unicode MS",
5  "Arial Unicode MS Bold",
6  "WenQuanYi Zen Hei Medium",
7 ]

Definition at line 331 of file language_specific.py.

◆ CHI_TRA_FONTS

list language_specific.CHI_TRA_FONTS
Initial value:
1 = [
2  "AR PL UKai TW",
3  "AR PL UMing TW MBE Light",
4  "AR PL UKai Patched",
5  "AR PL UMing Patched Light",
6  "Arial Unicode MS",
7  "Arial Unicode MS Bold",
8  "WenQuanYi Zen Hei Medium",
9 ]

Definition at line 339 of file language_specific.py.

◆ DEVANAGARI_FONTS

list language_specific.DEVANAGARI_FONTS
Initial value:
1 = [
2  "FreeSans",
3  "Chandas",
4  "Kalimati",
5  "Uttara",
6  "Lucida Sans",
7  "gargi Medium",
8  "Lohit Devanagari",
9  "Arial Unicode MS Bold",
10  "Ascender Uni",
11  "Noto Sans Devanagari Bold",
12  "Noto Sans Devanagari",
13  "Samyak Devanagari Medium",
14  "Sarai",
15  "Saral LT Bold",
16  "Saral LT Light",
17  "Nakula",
18  "Sahadeva",
19  "Samanata",
20  "Santipur OT Medium",
21 ]

Definition at line 192 of file language_specific.py.

◆ EARLY_LATIN_FONTS

list language_specific.EARLY_LATIN_FONTS
Initial value:
1 = [
2  *FRAKTUR_FONTS,
3  *LATIN_FONTS,
4  # The Wyld font family renders early modern ligatures encoded in the private
5  # unicode area.
6  "Wyld",
7  "Wyld Italic",
8  # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English.
9  "GentiumAlt",
10 ]

Definition at line 146 of file language_specific.py.

◆ FLAGS_webtext_prefix

language_specific.FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")

Definition at line 876 of file language_specific.py.

◆ FRAKTUR_FONTS

list language_specific.FRAKTUR_FONTS
Initial value:
1 = [
2  "CaslonishFraxx Medium",
3  "Cloister Black, Light",
4  "Proclamate Light",
5  "UnifrakturMaguntia",
6  "Walbaum-Fraktur",
7 ]

Definition at line 43 of file language_specific.py.

◆ GEORGIAN_FONTS

list language_specific.GEORGIAN_FONTS

Definition at line 629 of file language_specific.py.

◆ GREEK_FONTS

list language_specific.GREEK_FONTS

Definition at line 397 of file language_specific.py.

◆ GUJARATI_FONTS

list language_specific.GUJARATI_FONTS
Initial value:
1 = [
2  "Lohit Gujarati",
3  "Rekha Medium",
4  "Samyak Gujarati Medium",
5  "aakar Medium",
6  "padmaa Bold",
7  "padmaa Medium",
8  "Arial Unicode MS",
9  "Arial Unicode MS Bold",
10  "Ascender Uni",
11  "FreeSans",
12  "Noto Sans Gujarati Bold",
13  "Noto Sans Gujarati",
14  "Shruti",
15  "Shruti Bold",
16 ]

Definition at line 760 of file language_specific.py.

◆ HEBREW_FONTS

list language_specific.HEBREW_FONTS
Initial value:
1 = [
2  "Arial Bold",
3  "Arial Bold Italic",
4  "Arial Italic",
5  "Arial",
6  "Courier New Bold",
7  "Courier New Bold Italic",
8  "Courier New Italic",
9  "Courier New",
10  "Ergo Hebrew Semi-Bold",
11  "Ergo Hebrew Semi-Bold Italic",
12  "Ergo Hebrew",
13  "Ergo Hebrew Italic",
14  "Really No 2 LT W2G Light",
15  "Really No 2 LT W2G Light Italic",
16  "Really No 2 LT W2G Medium",
17  "Really No 2 LT W2G Medium Italic",
18  "Really No 2 LT W2G Semi-Bold",
19  "Really No 2 LT W2G Semi-Bold Italic",
20  "Really No 2 LT W2G Ultra-Bold",
21  "Really No 2 LT W2G Ultra-Bold Italic",
22  "Times New Roman, Bold",
23  "Times New Roman, Bold Italic",
24  "Times New Roman, Italic",
25  "Times New Roman,",
26  "Lucida Sans",
27  "Tahoma",
28 ]

Definition at line 489 of file language_specific.py.

◆ IRISH_UNCIAL_FONTS

list language_specific.IRISH_UNCIAL_FONTS
Initial value:
1 = [
2  "Bunchlo Arsa Dubh GC",
3  "Bunchlo Arsa GC",
4  "Bunchlo Arsa GC Bold",
5  "Bunchlo Dubh GC",
6  "Bunchlo GC",
7  "Bunchlo GC Bold",
8  "Bunchlo Nua GC Bold",
9  "Bunchló na Nod GC",
10  "Gadelica",
11  "Glanchlo Dubh GC",
12  "Glanchlo GC",
13  "Glanchlo GC Bold",
14  "Seanchló Dubh GC",
15  "Seanchló GC",
16  "Seanchló GC Bold",
17  "Seanchló na Nod GC",
18  "Seanchló Ársa Dubh GC",
19  "Seanchló Ársa GC",
20  "Seanchló Ársa GC Bold",
21  "Tromchlo Beag GC",
22  "Tromchlo Mor GC",
23  "Urchlo GC",
24  "Urchlo GC Bold",
25 ]

Definition at line 120 of file language_specific.py.

◆ JAVANESE_FONTS

list language_specific.JAVANESE_FONTS = ["Prada"]

Definition at line 616 of file language_specific.py.

◆ JPN_FONTS

list language_specific.JPN_FONTS
Initial value:
1 = [
2  "TakaoExGothic",
3  "TakaoExMincho",
4  "TakaoGothic",
5  "TakaoMincho",
6  "TakaoPGothic",
7  "TakaoPMincho",
8  "VL Gothic",
9  "VL PGothic",
10  "Noto Sans Japanese Bold",
11  "Noto Sans Japanese Light",
12 ]

Definition at line 349 of file language_specific.py.

◆ KANNADA_FONTS

list language_specific.KANNADA_FONTS
Initial value:
1 = [
2  "Kedage Bold",
3  "Kedage Italic",
4  "Kedage",
5  "Kedage Bold Italic",
6  "Mallige Bold",
7  "Mallige Italic",
8  "Mallige",
9  "Mallige Bold Italic",
10  "Arial Unicode MS",
11  "Arial Unicode MS Bold",
12  "Ascender Uni",
13  "cheluvi Medium",
14  "Noto Sans Kannada Bold",
15  "Noto Sans Kannada",
16  "Lohit Kannada",
17  "Tunga",
18  "Tunga Bold",
19 ]

Definition at line 214 of file language_specific.py.

◆ KHMER_FONTS

list language_specific.KHMER_FONTS
Initial value:
1 = [
2  "Khmer OS",
3  "Khmer OS System",
4  "Khmer OS Battambang",
5  "Khmer OS Bokor",
6  "Khmer OS Content",
7  "Khmer OS Fasthand",
8  "Khmer OS Freehand",
9  "Khmer OS Metal Chrieng",
10  "Khmer OS Muol Light",
11  "Khmer OS Muol Pali",
12  "Khmer OS Muol",
13  "Khmer OS Siemreap",
14  "Noto Sans Bold",
15  "Noto Sans",
16  "Noto Serif Khmer Bold",
17  "Noto Serif Khmer Light",
18 ]

Definition at line 694 of file language_specific.py.

◆ KOREAN_FONTS

list language_specific.KOREAN_FONTS
Initial value:
1 = [
2  "Arial Unicode MS",
3  "Arial Unicode MS Bold",
4  "Baekmuk Batang Patched",
5  "Baekmuk Batang",
6  "Baekmuk Dotum",
7  "Baekmuk Gulim",
8  "Baekmuk Headline",
9 ]

Definition at line 321 of file language_specific.py.

◆ KURDISH_FONTS

list language_specific.KURDISH_FONTS
Initial value:
1 = [
2  "Amiri Bold Italic",
3  "Amiri Bold",
4  "Amiri Italic",
5  "Amiri",
6  "Arial Unicode MS",
7  "Arial Unicode MS Bold",
8  "Lateef",
9  "Lucida Bright",
10  "Lucida Sans Oblique",
11  "Lucida Sans Semi-Bold",
12  "Lucida Sans",
13  "Lucida Sans Typewriter Bold",
14  "Lucida Sans Typewriter Oblique",
15  "Lucida Sans Typewriter",
16  "Scheherazade",
17  "Tahoma",
18  "Times New Roman,",
19  "Times New Roman, Bold",
20  "Times New Roman, Bold Italic",
21  "Times New Roman, Italic",
22  "Unikurd Web",
23  "Yakout Linotype Bold",
24  "Yakout Linotype",
25 ]

Definition at line 713 of file language_specific.py.

◆ KYRGYZ_FONTS

list language_specific.KYRGYZ_FONTS
Initial value:
1 = [
2  "Arial",
3  "Arial Bold",
4  "Arial Italic",
5  "Arial Bold Italic",
6  "Courier New",
7  "Courier New Bold",
8  "Courier New Italic",
9  "Courier New Bold Italic",
10  "Times New Roman,",
11  "Times New Roman, Bold",
12  "Times New Roman, Bold Italic",
13  "Times New Roman, Italic",
14  "DejaVu Serif",
15  "DejaVu Serif Oblique",
16  "DejaVu Serif Bold",
17  "DejaVu Serif Bold Oblique",
18  "Lucida Bright",
19  "FreeSerif Bold",
20  "FreeSerif Bold Italic",
21 ]

Definition at line 539 of file language_specific.py.

◆ LAOTHIAN_FONTS

list language_specific.LAOTHIAN_FONTS
Initial value:
1 = [
2  "Phetsarath OT",
3  "Arial Unicode MS",
4  "Arial Unicode MS Bold",
5  "Ascender Uni",
6  "Dhyana Bold",
7  "Dhyana",
8  "Lao Muang Don",
9  "Lao Muang Khong",
10  "Lao Sans Pro",
11  "Noto Sans Lao Bold",
12  "Noto Sans Lao",
13  "Noto Sans Lao UI Bold",
14  "Noto Sans Lao UI",
15  "Noto Serif Lao Bold",
16  "Noto Serif Lao",
17  "Phetsarath Bold",
18  "Phetsarath",
19  "Souliyo Unicode",
20 ]

Definition at line 739 of file language_specific.py.

◆ LATIN_FONTS

list language_specific.LATIN_FONTS

Definition at line 52 of file language_specific.py.

◆ log

language_specific.log = logging.getLogger(__name__)

Definition at line 25 of file language_specific.py.

◆ MALAYALAM_FONTS

list language_specific.MALAYALAM_FONTS
Initial value:
1 = [
2  "AnjaliOldLipi",
3  "Arial Unicode MS",
4  "Arial Unicode MS Bold",
5  "Ascender Uni",
6  "Dyuthi",
7  "FreeSerif",
8  "Kalyani",
9  "Kartika",
10  "Kartika Bold",
11  "Lohit Malayalam",
12  "Meera",
13  "Noto Sans Malayalam Bold",
14  "Noto Sans Malayalam",
15  "Rachana",
16  "Rachana_w01",
17  "RaghuMalayalam",
18  "suruma",
19 ]

Definition at line 777 of file language_specific.py.

◆ NEOLATIN_FONTS

list language_specific.NEOLATIN_FONTS

Definition at line 88 of file language_specific.py.

◆ NORTH_AMERICAN_ABORIGINAL_FONTS

list language_specific.NORTH_AMERICAN_ABORIGINAL_FONTS
Initial value:
1 = [
2  "Aboriginal Sans",
3  "Aboriginal Sans Bold Italic",
4  "Aboriginal Sans Italic",
5  "Aboriginal Sans Bold",
6  "Aboriginal Serif Bold",
7  "Aboriginal Serif Bold Italic",
8  "Aboriginal Serif Italic",
9  "Aboriginal Serif",
10 ]

Definition at line 618 of file language_specific.py.

◆ OLD_GEORGIAN_FONTS

list language_specific.OLD_GEORGIAN_FONTS
Initial value:
1 = [
2  "Arial Unicode MS Bold",
3  "Arial Unicode MS",
4  "BPG Algeti GPL\&GNU",
5  "BPG Courier S GPL\&GNU",
6  "BPG DejaVu Sans 2011 GNU-GPL",
7  "BPG Elite GPL\&GNU",
8  "BPG Excelsior GPL\&GNU",
9  "BPG Glaho GPL\&GNU",
10  "BPG Ingiri GPL\&GNU",
11  "BPG Mrgvlovani Caps GNU\&GPL",
12  "BPG Mrgvlovani GPL\&GNU",
13  "BPG Nateli Caps GPL\&GNU Light",
14  "BPG Nateli Condenced GPL\&GNU Light",
15  "BPG Nateli GPL\&GNU Light",
16  "BPG Nino Medium Cond GPL\&GNU",
17  "BPG Nino Medium GPL\&GNU Medium",
18  "BPG Sans GPL\&GNU",
19  "BPG Sans Medium GPL\&GNU",
20  "BPG Sans Modern GPL\&GNU",
21  "BPG Sans Regular GPL\&GNU",
22  "BPG Serif GPL\&GNU",
23  "BPG Serif Modern GPL\&GNU",
24  "FreeSans",
25  "FreeSerif",
26  "FreeSerif Bold",
27  "FreeSerif Bold Italic",
28  "FreeSerif Italic",
29 ]

Definition at line 664 of file language_specific.py.

◆ ORIYA_FONTS

list language_specific.ORIYA_FONTS
Initial value:
1 = [
2  "Arial Unicode MS",
3  "Arial Unicode MS Bold",
4  "Ascender Uni",
5  "ori1Uni Medium",
6  "Samyak Oriya Medium",
7  "Lohit Oriya",
8 ]

Definition at line 797 of file language_specific.py.

◆ PERSIAN_FONTS

list language_specific.PERSIAN_FONTS
Initial value:
1 = [
2  "Amiri Bold Italic",
3  "Amiri Bold",
4  "Amiri Italic",
5  "Amiri",
6  "Andale Sans Arabic Farsi",
7  "Arial Unicode MS",
8  "Arial Unicode MS Bold",
9  "Lateef",
10  "Lucida Bright",
11  "Lucida Sans Oblique",
12  "Lucida Sans Semi-Bold",
13  "Lucida Sans",
14  "Lucida Sans Typewriter Bold",
15  "Lucida Sans Typewriter Oblique",
16  "Lucida Sans Typewriter",
17  "Scheherazade",
18  "Tahoma",
19  "Times New Roman,",
20  "Times New Roman, Bold",
21  "Times New Roman, Bold Italic",
22  "Times New Roman, Italic",
23  "Yakout Linotype Bold",
24  "Yakout Linotype",
25 ]

Definition at line 561 of file language_specific.py.

◆ PUNJABI_FONTS

list language_specific.PUNJABI_FONTS
Initial value:
1 = [
2  "Arial Unicode MS",
3  "Arial Unicode MS Bold",
4  "Ascender Uni",
5  "Saab",
6  "Lohit Punjabi",
7  "Noto Sans Gurmukhi",
8  "Noto Sans Gurmukhi Bold",
9  "FreeSans",
10  "FreeSans Bold",
11  "FreeSerif",
12 ]

Definition at line 806 of file language_specific.py.

◆ RUSSIAN_FONTS

list language_specific.RUSSIAN_FONTS

Definition at line 362 of file language_specific.py.

◆ SINHALA_FONTS

list language_specific.SINHALA_FONTS
Initial value:
1 = [
2  "Noto Sans Sinhala Bold",
3  "Noto Sans Sinhala",
4  "OCRUnicode",
5  "Yagpo",
6  "LKLUG",
7  "FreeSerif",
8 ]

Definition at line 819 of file language_specific.py.

◆ SYRIAC_FONTS

list language_specific.SYRIAC_FONTS
Initial value:
1 = [
2  "East Syriac Adiabene",
3  "East Syriac Ctesiphon",
4  "Estrangelo Antioch",
5  "Estrangelo Edessa",
6  "Estrangelo Midyat",
7  "Estrangelo Nisibin",
8  "Estrangelo Quenneshrin",
9  "Estrangelo Talada",
10  "Estrangelo TurAbdin",
11  "Serto Batnan Bold",
12  "Serto Batnan",
13  "Serto Jerusalem Bold",
14  "Serto Jerusalem Italic",
15  "Serto Jerusalem",
16  "Serto Kharput",
17  "Serto Malankara",
18  "Serto Mardin Bold",
19  "Serto Mardin",
20  "Serto Urhoy Bold",
21  "Serto Urhoy",
22  "FreeSans",
23 ]

Definition at line 828 of file language_specific.py.

◆ TAMIL_FONTS

list language_specific.TAMIL_FONTS
Initial value:
1 = [
2  "TAMu_Kadambri",
3  "TAMu_Kalyani",
4  "TAMu_Maduram",
5  "TSCu_Paranar",
6  "TSCu_Times",
7  "TSCu_Paranar Bold",
8  "FreeSans",
9  "FreeSerif",
10  "Lohit Tamil",
11  "Arial Unicode MS Bold",
12  "Ascender Uni",
13  "Droid Sans Tamil Bold",
14  "Droid Sans Tamil",
15  "Karla Tamil Inclined Bold Italic",
16  "Karla Tamil Inclined Italic",
17  "Karla Tamil Upright Bold",
18  "Karla Tamil Upright",
19  "Noto Sans Tamil Bold",
20  "Noto Sans Tamil",
21  "Noto Sans Tamil UI Bold",
22  "Noto Sans Tamil UI",
23  "TSCu_Comic Normal",
24  "Lohit Tamil Classical",
25 ]

Definition at line 262 of file language_specific.py.

◆ TELUGU_FONTS

list language_specific.TELUGU_FONTS
Initial value:
1 = [
2  "Pothana2000",
3  "Vemana2000",
4  "Lohit Telugu",
5  "Arial Unicode MS Bold",
6  "Ascender Uni",
7  "Dhurjati",
8  "Gautami Bold",
9  "Gidugu",
10  "Gurajada",
11  "Lakki Reddy",
12  "Mallanna",
13  "Mandali",
14  "NATS",
15  "NTR",
16  "Noto Sans Telugu Bold",
17  "Noto Sans Telugu",
18  "Peddana",
19  "Ponnala",
20  "Ramabhadra",
21  "Ravi Prakash",
22  "Sree Krushnadevaraya",
23  "Suranna",
24  "Suravaram",
25  "Tenali Ramakrishna",
26  "Gautami",
27 ]

Definition at line 234 of file language_specific.py.

◆ THAANA_FONTS

list language_specific.THAANA_FONTS = ["FreeSerif"]

Definition at line 852 of file language_specific.py.

◆ THAI_FONTS

list language_specific.THAI_FONTS

Definition at line 288 of file language_specific.py.

◆ TIBETAN_FONTS

list language_specific.TIBETAN_FONTS
Initial value:
1 = [
2  "Arial Unicode MS",
3  "Arial Unicode MS Bold",
4  "Ascender Uni",
5  "DDC Uchen",
6  "Jomolhari",
7  "Kailasa",
8  "Kokonor",
9  "Tibetan Machine Uni",
10  "TibetanTsugRing",
11  "Yagpo",
12 ]

Definition at line 854 of file language_specific.py.

◆ UNUSABLE_LANGUAGE_CODES

string language_specific.UNUSABLE_LANGUAGE_CODES = ""

Definition at line 41 of file language_specific.py.

◆ VALID_LANGUAGE_CODES

tuple language_specific.VALID_LANGUAGE_CODES
Initial value:
1 = (
2  "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
3  "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
4  "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
5  "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
6  "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat "
7  "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
8  "pan pol por pus ron rus san sin slk slv snd spa spa_old "
9  "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
10  "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
11 )

Definition at line 28 of file language_specific.py.

◆ VERTICAL_FONTS

list language_specific.VERTICAL_FONTS
Initial value:
1 = [
2  "TakaoExGothic",
3  "TakaoExMincho",
4  "AR PL UKai Patched",
5  "AR PL UMing Patched Light",
6  "Baekmuk Batang Patched",
7 ]

Definition at line 868 of file language_specific.py.

◆ VIETNAMESE_FONTS

list language_specific.VIETNAMESE_FONTS

Definition at line 157 of file language_specific.py.

language_specific.set_lang_specific_parameters
def set_lang_specific_parameters(ctx, lang)
Definition: language_specific.py:894
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154