tesseract  5.0.0-alpha-619-ge9db
language_specific.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 # (C) Copyright 2014, Google Inc.
3 # (C) Copyright 2018, James R Barlow
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 #
14 # Set some language specific variables. Works in conjunction with
15 # tesstrain.sh
16 #
17 
18 # =============================================================================
19 # Language specific info
20 # =============================================================================
21 
22 import logging
23 import os
24 
25 log = logging.getLogger(__name__)
26 
# Space-separated string (not a list) of every valid language code.
# Call .split() to obtain the individual codes; note the trailing space.
VALID_LANGUAGE_CODES = (
    "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
    "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
    "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
    "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
    "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat "
    "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
    "pan pol por pus ron rus san sin slk slv snd spa spa_old "
    "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
    "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
)
39 
# Language codes that have webtext available but no usable fonts.
# Currently empty.
UNUSABLE_LANGUAGE_CODES = ""
42 
# Fraktur fonts: the default font list for "frk", and also folded into
# EARLY_LATIN_FONTS.
FRAKTUR_FONTS = [
    "CaslonishFraxx Medium",
    "Cloister Black, Light",
    "Proclamate Light",
    "UnifrakturMaguntia",
    "Walbaum-Fraktur",
]
50 
# Fonts to train on for Latin-script text. Entries are grouped by family;
# the trailing comma in "Times New Roman," is part of the font name string.
LATIN_FONTS = [
    # Arial
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    # Courier New
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    # Times New Roman
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    # Georgia
    "Georgia Bold",
    "Georgia Italic",
    "Georgia",
    "Georgia Bold Italic",
    # Trebuchet MS
    "Trebuchet MS Bold",
    "Trebuchet MS Bold Italic",
    "Trebuchet MS Italic",
    "Trebuchet MS",
    # Verdana
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    # URW Bookman L
    "URW Bookman L Bold",
    "URW Bookman L Italic",
    "URW Bookman L Bold Italic",
    # Century Schoolbook L
    "Century Schoolbook L Bold",
    "Century Schoolbook L Italic",
    "Century Schoolbook L Bold Italic",
    "Century Schoolbook L Medium",
    # DejaVu
    "DejaVu Sans Ultra-Light",
]
86 
# Fonts for printed/neo-Latin text, i.e. the 'lat' language code (which is
# distinct from the Latin script). Default font list for "lat".
NEOLATIN_FONTS = [
    # GFS Bodoni
    "GFS Bodoni",
    "GFS Bodoni Bold",
    "GFS Bodoni Italic",
    "GFS Bodoni Bold Italic",
    # GFS Didot
    "GFS Didot",
    "GFS Didot Bold",
    "GFS Didot Italic",
    "GFS Didot Bold Italic",
    # Cardo
    "Cardo",
    "Cardo Bold",
    "Cardo Italic",
    # Wyld
    "Wyld",
    "Wyld Italic",
    # EB Garamond
    "EB Garamond",
    "EB Garamond Italic",
    # Junicode
    "Junicode",
    "Junicode Bold",
    "Junicode Italic",
    "Junicode Bold Italic",
    # IM FELL, upright
    "IM FELL DW Pica PRO",
    "IM FELL English PRO",
    "IM FELL Double Pica PRO",
    "IM FELL French Canon PRO",
    "IM FELL Great Primer PRO",
    # IM FELL, italic
    "IM FELL DW Pica PRO Italic",
    "IM FELL English PRO Italic",
    "IM FELL Double Pica PRO Italic",
    "IM FELL French Canon PRO Italic",
    "IM FELL Great Primer PRO Italic",
]
119 
# Default font list for "gle_uncial". Some names contain accented
# characters (ó, Á) that must be kept exactly as written.
IRISH_UNCIAL_FONTS = [
    "Bunchlo Arsa Dubh GC",
    "Bunchlo Arsa GC",
    "Bunchlo Arsa GC Bold",
    "Bunchlo Dubh GC",
    "Bunchlo GC",
    "Bunchlo GC Bold",
    "Bunchlo Nua GC Bold",
    "Bunchló na Nod GC",
    "Gadelica",
    "Glanchlo Dubh GC",
    "Glanchlo GC",
    "Glanchlo GC Bold",
    "Seanchló Dubh GC",
    "Seanchló GC",
    "Seanchló GC Bold",
    "Seanchló na Nod GC",
    "Seanchló Ársa Dubh GC",
    "Seanchló Ársa GC",
    "Seanchló Ársa GC Bold",
    "Tromchlo Beag GC",
    "Tromchlo Mor GC",
    "Urchlo GC",
    "Urchlo GC Bold",
]
145 
# Fonts for early/historic Latin-script text: all Fraktur fonts, then all
# regular Latin fonts, then a few special-purpose faces. Default font list
# for "enm", "frm", "ita_old", "spa_old" and "lat_lid".
EARLY_LATIN_FONTS = FRAKTUR_FONTS + LATIN_FONTS + [
    # The Wyld font family renders early modern ligatures encoded in the
    # private unicode area.
    "Wyld",
    "Wyld Italic",
    # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English.
    "GentiumAlt",
]
156 
# Default font list for "vie".
VIETNAMESE_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial Unicode MS",
    "FreeMono Bold",
    "Courier New Bold Italic",
    "FreeMono Italic",
    "FreeMono",
    "GentiumAlt Italic",
    "GentiumAlt",
    # Palatino Linotype
    "Palatino Linotype Bold",
    "Palatino Linotype Bold Italic",
    "Palatino Linotype Italic",
    "Palatino Linotype",
    # Really No 2 LT W2G
    "Really No 2 LT W2G Light",
    "Really No 2 LT W2G Light Italic",
    "Really No 2 LT W2G Medium",
    "Really No 2 LT W2G Medium Italic",
    "Really No 2 LT W2G Semi-Bold",
    "Really No 2 LT W2G Semi-Bold Italic",
    "Really No 2 LT W2G Ultra-Bold",
    "Really No 2 LT W2G Ultra-Bold Italic",
    # Times New Roman
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    # Verdana
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    # VL
    "VL Gothic",
    "VL PGothic",
]
191 
# Default font list for "bih", "hin", "mar", "nep" and "san".
DEVANAGARI_FONTS = [
    "FreeSans",
    "Chandas",
    "Kalimati",
    "Uttara",
    "Lucida Sans",
    "gargi Medium",
    "Lohit Devanagari",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Noto Sans Devanagari Bold",
    "Noto Sans Devanagari",
    "Samyak Devanagari Medium",
    "Sarai",
    "Saral LT Bold",
    "Saral LT Light",
    "Nakula",
    "Sahadeva",
    "Samanata",
    "Santipur OT Medium",
]
213 
# Default font list for "kan".
KANNADA_FONTS = [
    # Kedage
    "Kedage Bold",
    "Kedage Italic",
    "Kedage",
    "Kedage Bold Italic",
    # Mallige
    "Mallige Bold",
    "Mallige Italic",
    "Mallige",
    "Mallige Bold Italic",
    # Others
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "cheluvi Medium",
    "Noto Sans Kannada Bold",
    "Noto Sans Kannada",
    "Lohit Kannada",
    "Tunga",
    "Tunga Bold",
]
233 
# Default font list for "tel".
TELUGU_FONTS = [
    "Pothana2000",
    "Vemana2000",
    "Lohit Telugu",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dhurjati",
    "Gautami Bold",
    "Gidugu",
    "Gurajada",
    "Lakki Reddy",
    "Mallanna",
    "Mandali",
    "NATS",
    "NTR",
    "Noto Sans Telugu Bold",
    "Noto Sans Telugu",
    "Peddana",
    "Ponnala",
    "Ramabhadra",
    "Ravi Prakash",
    "Sree Krushnadevaraya",
    "Suranna",
    "Suravaram",
    "Tenali Ramakrishna",
    "Gautami",
]
261 
# Default font list for "tam".
TAMIL_FONTS = [
    "TAMu_Kadambri",
    "TAMu_Kalyani",
    "TAMu_Maduram",
    "TSCu_Paranar",
    "TSCu_Times",
    "TSCu_Paranar Bold",
    "FreeSans",
    "FreeSerif",
    "Lohit Tamil",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Droid Sans Tamil Bold",
    "Droid Sans Tamil",
    "Karla Tamil Inclined Bold Italic",
    "Karla Tamil Inclined Italic",
    "Karla Tamil Upright Bold",
    "Karla Tamil Upright",
    "Noto Sans Tamil Bold",
    "Noto Sans Tamil",
    "Noto Sans Tamil UI Bold",
    "Noto Sans Tamil UI",
    "TSCu_Comic Normal",
    "Lohit Tamil Classical",
]
287 
# Default font list for "tha".
THAI_FONTS = [
    "FreeSerif",
    "FreeSerif Italic",
    "Garuda",
    "Norasi",
    "Lucida Sans Typewriter",
    "Lucida Sans",
    "Garuda Oblique",
    "Norasi Oblique",
    "Norasi Italic",
    "Garuda Bold",
    "Norasi Bold",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Semi-Bold",
    "Garuda Bold Oblique",
    "Norasi Bold Italic",
    "Norasi Bold Oblique",
    "AnuParp LT Thai",
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "Ascender Uni",
    "Loma",
    "Noto Serif Thai Bold",
    "Noto Serif Thai",
    "Purisa Light",
    "Sirichana LT Bold",
    "Sirichana LT",
    "Sukothai LT Bold",
    "Sukothai LT",
    "UtSaHaGumm LT Thai",
    "Tahoma",
]
320 
# Default font list for "kor".
KOREAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Baekmuk Batang Patched",
    "Baekmuk Batang",
    "Baekmuk Dotum",
    "Baekmuk Gulim",
    "Baekmuk Headline",
]
330 
# Default font list for "chi_sim".
CHI_SIM_FONTS = [
    "AR PL UKai CN",
    "AR PL UMing Patched Light",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "WenQuanYi Zen Hei Medium",
]
338 
# Default font list for "chi_tra".
CHI_TRA_FONTS = [
    "AR PL UKai TW",
    "AR PL UMing TW MBE Light",
    "AR PL UKai Patched",
    "AR PL UMing Patched Light",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "WenQuanYi Zen Hei Medium",
]
348 
# Default font list for "jpn".
JPN_FONTS = [
    # Takao
    "TakaoExGothic",
    "TakaoExMincho",
    "TakaoGothic",
    "TakaoMincho",
    "TakaoPGothic",
    "TakaoPMincho",
    # VL
    "VL Gothic",
    "VL PGothic",
    # Noto
    "Noto Sans Japanese Bold",
    "Noto Sans Japanese Light",
]
361 
# Default font list for "rus", for the other Cyrillic-script languages
# ("aze_cyrl", "bel", "bul", "kaz", "mkd", "srp", "tgk", "ukr", "uzb_cyrl")
# and for the "cyr_lid" language-id model.
RUSSIAN_FONTS = [
    # Arial
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    # Courier New
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    # Times New Roman
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    # Georgia
    "Georgia Bold",
    "Georgia Italic",
    "Georgia",
    "Georgia Bold Italic",
    # Trebuchet MS
    "Trebuchet MS Bold",
    "Trebuchet MS Bold Italic",
    "Trebuchet MS Italic",
    "Trebuchet MS",
    # Verdana
    "Verdana Bold",
    "Verdana Italic",
    "Verdana",
    "Verdana Bold Italic",
    # DejaVu Serif
    "DejaVu Serif",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    # Others
    "Lucida Bright",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "DejaVu Sans Ultra-Light",
]
396 
# Font list for Greek text.
GREEK_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    # DejaVu Sans Mono
    "DejaVu Sans Mono",
    "DejaVu Sans Mono Oblique",
    "DejaVu Sans Mono Bold",
    "DejaVu Sans Mono Bold Oblique",
    # DejaVu Serif
    "DejaVu Serif",
    "DejaVu Serif Semi-Condensed",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    "DejaVu Serif Bold Semi-Condensed",
    # FreeSerif
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
    "FreeSerif",
    # GentiumAlt
    "GentiumAlt",
    "GentiumAlt Italic",
    # Linux Biolinum / Libertine
    "Linux Biolinum O Bold",
    "Linux Biolinum O",
    "Linux Libertine O Bold",
    "Linux Libertine O",
    "Linux Libertine O Bold Italic",
    "Linux Libertine O Italic",
    # Palatino Linotype
    "Palatino Linotype Bold",
    "Palatino Linotype Bold Italic",
    "Palatino Linotype Italic",
    "Palatino Linotype",
    # Others
    "UmePlus P Gothic",
    "VL PGothic",
]
429 
# Font list for Ancient Greek text (all GFS families).
ANCIENT_GREEK_FONTS = [
    # GFS Artemisia
    "GFS Artemisia",
    "GFS Artemisia Bold",
    "GFS Artemisia Bold Italic",
    "GFS Artemisia Italic",
    # GFS Bodoni
    "GFS Bodoni",
    "GFS Bodoni Bold",
    "GFS Bodoni Bold Italic",
    "GFS Bodoni Italic",
    # GFS Didot
    "GFS Didot",
    "GFS Didot Bold",
    "GFS Didot Bold Italic",
    "GFS Didot Italic",
    "GFS DidotClassic",
    # GFS Neohellenic
    "GFS Neohellenic",
    "GFS Neohellenic Bold",
    "GFS Neohellenic Bold Italic",
    "GFS Neohellenic Italic",
    # Single-style GFS faces
    "GFS Philostratos",
    "GFS Porson",
    "GFS Pyrsos",
    "GFS Solomos",
]
453 
# Default font list for "ara".
ARABIC_FONTS = [
    "Arabic Transparent Bold",
    "Arabic Transparent",
    "Arab",
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "ASVCodar LT Bold",
    "ASVCodar LT Light",
    "Badiya LT Bold",
    "Badiya LT",
    "Badr LT Bold",
    "Badr LT",
    "Dimnah",
    "Frutiger LT Arabic Bold",
    "Frutiger LT Arabic",
    "Furat",
    "Hassan LT Bold",
    "Hassan LT Light",
    "Jalal LT Bold",
    "Jalal LT Light",
    "Midan Bold",
    "Midan",
    "Mitra LT Bold",
    "Mitra LT Light",
    "Palatino LT Arabic",
    "Palatino Sans Arabic Bold",
    "Palatino Sans Arabic",
    "Simplified Arabic Bold",
    "Simplified Arabic",
    "Times New Roman, Bold",
    "Times New Roman,",
    "Traditional Arabic Bold",
    "Traditional Arabic",
]
488 
# Font list for Hebrew-script text.
HEBREW_FONTS = [
    # Arial
    "Arial Bold",
    "Arial Bold Italic",
    "Arial Italic",
    "Arial",
    # Courier New
    "Courier New Bold",
    "Courier New Bold Italic",
    "Courier New Italic",
    "Courier New",
    # Ergo Hebrew
    "Ergo Hebrew Semi-Bold",
    "Ergo Hebrew Semi-Bold Italic",
    "Ergo Hebrew",
    "Ergo Hebrew Italic",
    # Really No 2 LT W2G
    "Really No 2 LT W2G Light",
    "Really No 2 LT W2G Light Italic",
    "Really No 2 LT W2G Medium",
    "Really No 2 LT W2G Medium Italic",
    "Really No 2 LT W2G Semi-Bold",
    "Really No 2 LT W2G Semi-Bold Italic",
    "Really No 2 LT W2G Ultra-Bold",
    "Really No 2 LT W2G Ultra-Bold Italic",
    # Times New Roman
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Times New Roman,",
    # Others
    "Lucida Sans",
    "Tahoma",
]
517 
# Default font list for "asm" and "ben".
# Each font appears exactly once: "Lohit Bengali" used to be listed a second
# time after "Lohit Assamese", and that redundant repeat has been dropped.
BENGALI_FONTS = [
    "Bangla Medium",
    "Lohit Bengali",
    "Mukti Narrow",
    "Mukti Narrow Bold",
    "Jamrul Medium Semi-Expanded",
    "Likhan Medium",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeSans",
    "FreeSans Oblique",
    "FreeSerif",
    "FreeSerif Italic",
    "Noto Sans Bengali Bold",
    "Noto Sans Bengali",
    "Ani",
    "Lohit Assamese",
    "Mitra Mono",
]
538 
# Font list for Kyrgyz text.
KYRGYZ_FONTS = [
    # Arial
    "Arial",
    "Arial Bold",
    "Arial Italic",
    "Arial Bold Italic",
    # Courier New
    "Courier New",
    "Courier New Bold",
    "Courier New Italic",
    "Courier New Bold Italic",
    # Times New Roman
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    # DejaVu Serif
    "DejaVu Serif",
    "DejaVu Serif Oblique",
    "DejaVu Serif Bold",
    "DejaVu Serif Bold Oblique",
    # Others
    "Lucida Bright",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
]
560 
# Default font list for "fas", "pus", "snd", "uig" and "urd".
PERSIAN_FONTS = [
    # Amiri
    "Amiri Bold Italic",
    "Amiri Bold",
    "Amiri Italic",
    "Amiri",
    "Andale Sans Arabic Farsi",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Lateef",
    # Lucida
    "Lucida Bright",
    "Lucida Sans Oblique",
    "Lucida Sans Semi-Bold",
    "Lucida Sans",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Typewriter Oblique",
    "Lucida Sans Typewriter",
    "Scheherazade",
    "Tahoma",
    # Times New Roman
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    # Yakout Linotype
    "Yakout Linotype Bold",
    "Yakout Linotype",
]
586 
# Font list for Amharic text.
AMHARIC_FONTS = [
    "Abyssinica SIL",
    "Droid Sans Ethiopic Bold",
    "Droid Sans Ethiopic",
    "FreeSerif",
    "Noto Sans Ethiopic Bold",
    "Noto Sans Ethiopic",
]
595 
# Font list for Armenian text.
ARMENIAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeMono",
    "FreeMono Italic",
    "FreeSans",
    "FreeSans Bold",
    "FreeSans Oblique",
]
606 
# Default font list for "mya".
BURMESE_FONTS = [
    "Myanmar Sans Pro",
    "Noto Sans Myanmar Bold",
    "Noto Sans Myanmar",
    "Padauk Bold",
    "Padauk",
    "TharLon",
]
615 
# Default font list for "jav_java" (a single font).
JAVANESE_FONTS = [
    "Prada",
]
617 
# Font list for North American aboriginal scripts.
NORTH_AMERICAN_ABORIGINAL_FONTS = [
    # Aboriginal Sans
    "Aboriginal Sans",
    "Aboriginal Sans Bold Italic",
    "Aboriginal Sans Italic",
    "Aboriginal Sans Bold",
    # Aboriginal Serif
    "Aboriginal Serif Bold",
    "Aboriginal Serif Bold Italic",
    "Aboriginal Serif Italic",
    "Aboriginal Serif",
]
628 
# Font list for Georgian text.
# The BPG font names contain a literal backslash before "&". It is written
# as "\\&" because a bare "\&" is an invalid escape sequence, which Python
# warns about (SyntaxWarning since 3.12). The runtime value is unchanged.
GEORGIAN_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "BPG Algeti GPL\\&GNU",
    "BPG Chveulebrivi GPL\\&GNU",
    "BPG Courier GPL\\&GNU",
    "BPG Courier S GPL\\&GNU",
    "BPG DejaVu Sans 2011 GNU-GPL",
    "BPG Elite GPL\\&GNU",
    "BPG Excelsior GPL\\&GNU",
    "BPG Glaho GPL\\&GNU",
    "BPG Gorda GPL\\&GNU",
    "BPG Ingiri GPL\\&GNU",
    "BPG Mrgvlovani Caps GNU\\&GPL",
    "BPG Mrgvlovani GPL\\&GNU",
    "BPG Nateli Caps GPL\\&GNU Light",
    "BPG Nateli Condenced GPL\\&GNU Light",
    "BPG Nateli GPL\\&GNU Light",
    "BPG Nino Medium Cond GPL\\&GNU",
    "BPG Nino Medium GPL\\&GNU Medium",
    "BPG Sans GPL\\&GNU",
    "BPG Sans Medium GPL\\&GNU",
    "BPG Sans Modern GPL\\&GNU",
    "BPG Sans Regular GPL\\&GNU",
    "BPG Serif GPL\\&GNU",
    "BPG Serif Modern GPL\\&GNU",
    "FreeMono",
    "FreeMono Bold Italic",
    "FreeSans",
    "FreeSerif",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
]
663 
# Font list for Old Georgian text.
# The BPG font names contain a literal backslash before "&". It is written
# as "\\&" because a bare "\&" is an invalid escape sequence, which Python
# warns about (SyntaxWarning since 3.12). The runtime value is unchanged.
OLD_GEORGIAN_FONTS = [
    "Arial Unicode MS Bold",
    "Arial Unicode MS",
    "BPG Algeti GPL\\&GNU",
    "BPG Courier S GPL\\&GNU",
    "BPG DejaVu Sans 2011 GNU-GPL",
    "BPG Elite GPL\\&GNU",
    "BPG Excelsior GPL\\&GNU",
    "BPG Glaho GPL\\&GNU",
    "BPG Ingiri GPL\\&GNU",
    "BPG Mrgvlovani Caps GNU\\&GPL",
    "BPG Mrgvlovani GPL\\&GNU",
    "BPG Nateli Caps GPL\\&GNU Light",
    "BPG Nateli Condenced GPL\\&GNU Light",
    "BPG Nateli GPL\\&GNU Light",
    "BPG Nino Medium Cond GPL\\&GNU",
    "BPG Nino Medium GPL\\&GNU Medium",
    "BPG Sans GPL\\&GNU",
    "BPG Sans Medium GPL\\&GNU",
    "BPG Sans Modern GPL\\&GNU",
    "BPG Sans Regular GPL\\&GNU",
    "BPG Serif GPL\\&GNU",
    "BPG Serif Modern GPL\\&GNU",
    "FreeSans",
    "FreeSerif",
    "FreeSerif Bold",
    "FreeSerif Bold Italic",
    "FreeSerif Italic",
]
693 
# Default font list for "khm".
KHMER_FONTS = [
    # Khmer OS
    "Khmer OS",
    "Khmer OS System",
    "Khmer OS Battambang",
    "Khmer OS Bokor",
    "Khmer OS Content",
    "Khmer OS Fasthand",
    "Khmer OS Freehand",
    "Khmer OS Metal Chrieng",
    "Khmer OS Muol Light",
    "Khmer OS Muol Pali",
    "Khmer OS Muol",
    "Khmer OS Siemreap",
    # Noto
    "Noto Sans Bold",
    "Noto Sans",
    "Noto Serif Khmer Bold",
    "Noto Serif Khmer Light",
]
712 
# Font list for Kurdish text.
KURDISH_FONTS = [
    # Amiri
    "Amiri Bold Italic",
    "Amiri Bold",
    "Amiri Italic",
    "Amiri",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Lateef",
    # Lucida
    "Lucida Bright",
    "Lucida Sans Oblique",
    "Lucida Sans Semi-Bold",
    "Lucida Sans",
    "Lucida Sans Typewriter Bold",
    "Lucida Sans Typewriter Oblique",
    "Lucida Sans Typewriter",
    "Scheherazade",
    "Tahoma",
    # Times New Roman
    "Times New Roman,",
    "Times New Roman, Bold",
    "Times New Roman, Bold Italic",
    "Times New Roman, Italic",
    "Unikurd Web",
    # Yakout Linotype
    "Yakout Linotype Bold",
    "Yakout Linotype",
]
738 
# Default font list for "lao".
LAOTHIAN_FONTS = [
    "Phetsarath OT",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dhyana Bold",
    "Dhyana",
    "Lao Muang Don",
    "Lao Muang Khong",
    "Lao Sans Pro",
    "Noto Sans Lao Bold",
    "Noto Sans Lao",
    "Noto Sans Lao UI Bold",
    "Noto Sans Lao UI",
    "Noto Serif Lao Bold",
    "Noto Serif Lao",
    "Phetsarath Bold",
    "Phetsarath",
    "Souliyo Unicode",
]
759 
# Default font list for "guj".
GUJARATI_FONTS = [
    "Lohit Gujarati",
    "Rekha Medium",
    "Samyak Gujarati Medium",
    "aakar Medium",
    "padmaa Bold",
    "padmaa Medium",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "FreeSans",
    "Noto Sans Gujarati Bold",
    "Noto Sans Gujarati",
    "Shruti",
    "Shruti Bold",
]
776 
# Default font list for "mal".
MALAYALAM_FONTS = [
    "AnjaliOldLipi",
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Dyuthi",
    "FreeSerif",
    "Kalyani",
    "Kartika",
    "Kartika Bold",
    "Lohit Malayalam",
    "Meera",
    "Noto Sans Malayalam Bold",
    "Noto Sans Malayalam",
    "Rachana",
    "Rachana_w01",
    "RaghuMalayalam",
    "suruma",
]
796 
# Default font list for "ori".
ORIYA_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "ori1Uni Medium",
    "Samyak Oriya Medium",
    "Lohit Oriya",
]
805 
# Default font list for "pan".
PUNJABI_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "Saab",
    "Lohit Punjabi",
    "Noto Sans Gurmukhi",
    "Noto Sans Gurmukhi Bold",
    "FreeSans",
    "FreeSans Bold",
    "FreeSerif",
]
818 
# Default font list for "sin".
SINHALA_FONTS = [
    "Noto Sans Sinhala Bold",
    "Noto Sans Sinhala",
    "OCRUnicode",
    "Yagpo",
    "LKLUG",
    "FreeSerif",
]
827 
# Font list for Syriac text.
SYRIAC_FONTS = [
    # East Syriac
    "East Syriac Adiabene",
    "East Syriac Ctesiphon",
    # Estrangelo
    "Estrangelo Antioch",
    "Estrangelo Edessa",
    "Estrangelo Midyat",
    "Estrangelo Nisibin",
    "Estrangelo Quenneshrin",
    "Estrangelo Talada",
    "Estrangelo TurAbdin",
    # Serto
    "Serto Batnan Bold",
    "Serto Batnan",
    "Serto Jerusalem Bold",
    "Serto Jerusalem Italic",
    "Serto Jerusalem",
    "Serto Kharput",
    "Serto Malankara",
    "Serto Mardin Bold",
    "Serto Mardin",
    "Serto Urhoy Bold",
    "Serto Urhoy",
    # Others
    "FreeSans",
]
851 
# Default font list for "div" (a single font).
THAANA_FONTS = [
    "FreeSerif",
]
853 
# Default font list for "bod" and "dzo".
TIBETAN_FONTS = [
    "Arial Unicode MS",
    "Arial Unicode MS Bold",
    "Ascender Uni",
    "DDC Uchen",
    "Jomolhari",
    "Kailasa",
    "Kokonor",
    "Tibetan Machine Uni",
    "TibetanTsugRing",
    "Yagpo",
]
866 
# Fonts that are rendered vertically in phase I.
VERTICAL_FONTS = [
    "TakaoExGothic",
    "TakaoExMincho",
    "AR PL UKai Patched",
    "AR PL UMing Patched Light",
    "Baekmuk Batang Patched",
]
875 
# Directory prefix for the per-language "<lang>.corpus.txt" files, read from
# the FLAGS_webtext_prefix environment variable (empty string when unset).
FLAGS_webtext_prefix = os.getenv("FLAGS_webtext_prefix", "")
877 
878 
879 # Set language-specific values for several global variables, including
880 # ${TEXT_CORPUS}
881 # holds the text corpus file for the language, used in phase F
882 # ${FONTS[@]}
883 # holds a sequence of applicable fonts for the language, used in
884 # phase F & I. only set if not already set, i.e. from command line
885 # ${TRAINING_DATA_ARGUMENTS}
886 # non-default arguments to the training_data program used in phase T
887 # ${FILTER_ARGUMENTS}[ -]
888 # character-code-specific filtering to distinguish between scripts
889 # (e.g. CJK) used by filter_forbidden_characters in phase F
890 # ${WORDLIST2DAWG_ARGUMENTS}
891 # specify fixed length dawg generation for non-space-delimited lang
892 # TODO(dsl): We can refactor these into functions that assign FONTS,
893 # TEXT_CORPUS, etc. separately.
895  # The default text location is now given directly from the language code.
896  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt"
897  FILTER_ARGUMENTS = []
898  WORDLIST2DAWG_ARGUMENTS = ""
899  # These dawg factors represent the fraction of the corpus not covered by the
900  # dawg, and seem like reasonable defaults, but the optimal value is likely
901  # to be highly corpus-dependent, as well as somewhat language-dependent.
902  # Number dawg factor is the fraction of all numeric strings that are not
903  # covered, which is why it is higher relative to the others.
904  PUNC_DAWG_FACTOR = None
905  NUMBER_DAWG_FACTOR = 0.125
906  WORD_DAWG_FACTOR = 0.05
907  BIGRAM_DAWG_FACTOR = 0.015
908  TRAINING_DATA_ARGUMENTS = []
909  FRAGMENTS_DISABLED = "y"
910  RUN_SHAPE_CLUSTERING = False
911  AMBIGS_FILTER_DENOMINATOR = "100000"
912  LEADING = 32
913  MEAN_COUNT = 40 # Default for latin script.
914  # Language to mix with the language for maximum accuracy. Defaults to eng.
915  # If no language is good, set to the base language.
916  MIX_LANG = "eng"
917  FONTS = ctx.fonts
918  TEXT2IMAGE_EXTRA_ARGS = []
919  EXPOSURES = []
920 
921  GENERATE_WORD_BIGRAMS = None
922  WORD_DAWG_SIZE = None
923 
924  # Latin languages.
925  if lang == "enm":
926  TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported
927  if not FONTS:
928  FONTS = EARLY_LATIN_FONTS
929  elif lang == "frm":
930  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
931  # Make long-s substitutions for Middle French text
932  FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
933  TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
934  if not FONTS:
935  FONTS = EARLY_LATIN_FONTS
936  elif lang == "frk":
937  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
938  if not FONTS:
939  FONTS = FRAKTUR_FONTS
940  elif lang == "ita_old":
941  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
942  # Make long-s substitutions for Early Italian text
943  FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
944  TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
945  if not FONTS:
946  FONTS = EARLY_LATIN_FONTS
947  elif lang == "lat":
948  if not EXPOSURES:
949  EXPOSURES = "-3 -2 -1 0 1 2 3".split()
950  if not FONTS:
951  FONTS = NEOLATIN_FONTS
952  elif lang == "spa_old":
953  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
954  # Make long-s substitutions for Early Spanish text
955  FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
956  TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported.
957  if not FONTS:
958  FONTS = EARLY_LATIN_FONTS
959  elif lang == "srp_latn":
960  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
961  elif lang == "vie":
962  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
963  if not FONTS:
964  FONTS = VIETNAMESE_FONTS
965  # Highly inflective languages get a bigger dawg size.
966  # TODO(rays) Add more here!
967  elif lang == "hun":
968  WORD_DAWG_SIZE = 1_000_000
969  elif lang == "pol":
970  WORD_DAWG_SIZE = 1_000_000
971 
972  # Latin with default treatment.
973  elif lang == "afr":
974  pass
975  elif lang == "aze":
976  pass
977  elif lang == "bos":
978  pass
979  elif lang == "cat":
980  pass
981  elif lang == "ceb":
982  pass
983  elif lang == "ces":
984  PUNC_DAWG_FACTOR = 0.004
985  elif lang == "cym":
986  pass
987  elif lang == "dan":
988  pass
989  elif lang == "deu":
990  WORD_DAWG_FACTOR = 0.125
991  elif lang == "eng":
992  WORD_DAWG_FACTOR = 0.03
993  elif lang == "epo":
994  pass
995  elif lang == "est":
996  pass
997  elif lang == "eus":
998  pass
999  elif lang == "fil":
1000  pass
1001  elif lang == "fin":
1002  pass
1003  elif lang == "fra":
1004  WORD_DAWG_FACTOR = 0.08
1005  elif lang == "gle":
1006  pass
1007  elif lang == "gle_uncial":
1008  if not FONTS:
1009  FONTS = IRISH_UNCIAL_FONTS
1010  elif lang == "glg":
1011  pass
1012  elif lang == "hat":
1013  pass
1014  elif lang == "hrv":
1015  pass
1016  elif lang == "iast":
1017  pass
1018  elif lang == "ind":
1019  pass
1020  elif lang == "isl":
1021  pass
1022  elif lang == "ita":
1023  pass
1024  elif lang == "jav":
1025  pass
1026  elif lang == "lav":
1027  pass
1028  elif lang == "lit":
1029  pass
1030  elif lang == "mlt":
1031  pass
1032  elif lang == "msa":
1033  pass
1034  elif lang == "nld":
1035  WORD_DAWG_FACTOR = 0.02
1036  elif lang == "nor":
1037  pass
1038  elif lang == "por":
1039  pass
1040  elif lang == "ron":
1041  pass
1042  elif lang == "slk":
1043  pass
1044  elif lang == "slv":
1045  pass
1046  elif lang == "spa":
1047  pass
1048  elif lang == "sqi":
1049  pass
1050  elif lang == "swa":
1051  pass
1052  elif lang == "swe":
1053  pass
1054  elif lang == "tgl":
1055  pass
1056  elif lang == "tur":
1057  pass
1058  elif lang == "uzb":
1059  pass
1060  elif lang == "zlm":
1061  pass
1062 
1063  # Special code for performing language-id that is trained on
1064  # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
1065  elif lang == "lat_lid":
1066  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
1067  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1068  GENERATE_WORD_BIGRAMS = 0
1069  # Strip unrenderable words as not all fonts will render the extended
1070  # latin symbols found in Vietnamese text.
1071  WORD_DAWG_SIZE = 1_000_000
1072  if not FONTS:
1073  FONTS = EARLY_LATIN_FONTS
1074 
1075  # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
1076  elif lang == "rus":
1077  if not FONTS:
1078  FONTS = RUSSIAN_FONTS
1079  MIX_LANG = "rus"
1080  NUMBER_DAWG_FACTOR = 0.05
1081  WORD_DAWG_SIZE = 1_000_000
1082  elif lang in (
1083  "aze_cyrl",
1084  "bel",
1085  "bul",
1086  "kaz",
1087  "mkd",
1088  "srp",
1089  "tgk",
1090  "ukr",
1091  "uzb_cyrl",
1092  ):
1093  MIX_LANG = f"{lang}"
1094  if not FONTS:
1095  FONTS = RUSSIAN_FONTS
1096 
1097  # Special code for performing Cyrillic language-id that is trained on
1098  # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
1099  # text with the list of Russian fonts.
1100  elif lang == "cyr_lid":
1101  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
1102  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1103  GENERATE_WORD_BIGRAMS = 0
1104  WORD_DAWG_SIZE = 1_000_000
1105  if not FONTS:
1106  FONTS = RUSSIAN_FONTS
1107 
1108  # South Asian scripts mostly have a lot of different graphemes, so trim
1109  # down the MEAN_COUNT so as not to get a huge amount of text.
1110  elif lang in ("asm", "ben"):
1111  MEAN_COUNT = 15
1112  WORD_DAWG_FACTOR = 0.15
1113  if not FONTS:
1114  FONTS = BENGALI_FONTS
1115  elif lang in ("bih", "hin", "mar", "nep", "san"):
1116  MEAN_COUNT = 15
1117  WORD_DAWG_FACTOR = 0.15
1118  if not FONTS:
1119  FONTS = DEVANAGARI_FONTS
1120  elif lang == "bod":
1121  MEAN_COUNT = 15
1122  WORD_DAWG_FACTOR = 0.15
1123  if not FONTS:
1124  FONTS = TIBETAN_FONTS
1125  elif lang == "dzo":
1126  WORD_DAWG_FACTOR = 0.01
1127  if not FONTS:
1128  FONTS = TIBETAN_FONTS
1129  elif lang == "guj":
1130  MEAN_COUNT = 15
1131  WORD_DAWG_FACTOR = 0.15
1132  if not FONTS:
1133  FONTS = GUJARATI_FONTS
1134  elif lang == "kan":
1135  MEAN_COUNT = 15
1136  WORD_DAWG_FACTOR = 0.15
1137  TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1138  TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1139  if not FONTS:
1140  FONTS = KANNADA_FONTS
1141  elif lang == "mal":
1142  MEAN_COUNT = 15
1143  WORD_DAWG_FACTOR = 0.15
1144  TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1145  TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1146  if not FONTS:
1147  FONTS = MALAYALAM_FONTS
1148  elif lang == "ori":
1149  WORD_DAWG_FACTOR = 0.01
1150  if not FONTS:
1151  FONTS = ORIYA_FONTS
1152  elif lang == "pan":
1153  MEAN_COUNT = 15
1154  WORD_DAWG_FACTOR = 0.01
1155  if not FONTS:
1156  FONTS = PUNJABI_FONTS
1157  elif lang == "sin":
1158  MEAN_COUNT = 15
1159  WORD_DAWG_FACTOR = 0.01
1160  if not FONTS:
1161  FONTS = SINHALA_FONTS
1162  elif lang == "tam":
1163  MEAN_COUNT = 30
1164  WORD_DAWG_FACTOR = 0.15
1165  TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1166  TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1167  if not FONTS:
1168  FONTS = TAMIL_FONTS
1169  elif lang == "tel":
1170  MEAN_COUNT = 15
1171  WORD_DAWG_FACTOR = 0.15
1172  TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
1173  TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
1174  if not FONTS:
1175  FONTS = TELUGU_FONTS
1176 
1177  # SouthEast Asian scripts.
1178  elif lang == "jav_java":
1179  MEAN_COUNT = 15
1180  WORD_DAWG_FACTOR = 0.15
1181  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1182  if not FONTS:
1183  FONTS = JAVANESE_FONTS
1184  elif lang == "khm":
1185  MEAN_COUNT = 15
1186  WORD_DAWG_FACTOR = 0.15
1187  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1188  if not FONTS:
1189  FONTS = KHMER_FONTS
1190  elif lang == "lao":
1191  MEAN_COUNT = 15
1192  WORD_DAWG_FACTOR = 0.15
1193  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1194  if not FONTS:
1195  FONTS = LAOTHIAN_FONTS
1196  elif lang == "mya":
1197  MEAN_COUNT = 12
1198  WORD_DAWG_FACTOR = 0.15
1199  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1200  if not FONTS:
1201  FONTS = BURMESE_FONTS
1202  elif lang == "tha":
1203  MEAN_COUNT = 30
1204  WORD_DAWG_FACTOR = 0.01
1205  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1206  FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
1207  TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1208  AMBIGS_FILTER_DENOMINATOR = "1000"
1209  LEADING = 48
1210  if not FONTS:
1211  FONTS = THAI_FONTS
1212 
1213  # CJK
1214  elif lang == "chi_sim":
1215  MEAN_COUNT = 15
1216  PUNC_DAWG_FACTOR = 0.015
1217  WORD_DAWG_FACTOR = 0.015
1218  GENERATE_WORD_BIGRAMS = 0
1219  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1220  TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1221  FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
1222  if not FONTS:
1223  FONTS = CHI_SIM_FONTS
1224  elif lang == "chi_tra":
1225  MEAN_COUNT = 15
1226  WORD_DAWG_FACTOR = 0.015
1227  GENERATE_WORD_BIGRAMS = 0
1228  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1229  TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1230  FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
1231  if not FONTS:
1232  FONTS = CHI_TRA_FONTS
1233  elif lang == "jpn":
1234  MEAN_COUNT = 15
1235  WORD_DAWG_FACTOR = 0.015
1236  GENERATE_WORD_BIGRAMS = 0
1237  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1238  TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
1239  FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
1240  if not FONTS:
1241  FONTS = JPN_FONTS
1242  elif lang == "kor":
1243  MEAN_COUNT = 20
1244  WORD_DAWG_FACTOR = 0.015
1245  NUMBER_DAWG_FACTOR = 0.05
1246  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
1247  TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
1248  GENERATE_WORD_BIGRAMS = 0
1249  FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
1250  if not FONTS:
1251  FONTS = KOREAN_FONTS
1252 
1253  # Middle-Eastern scripts.
1254  elif lang == "ara":
1255  if not FONTS:
1256  FONTS = ARABIC_FONTS
1257  elif lang == "div":
1258  if not FONTS:
1259  FONTS = THAANA_FONTS
1260  elif lang in ("fas", "pus", "snd", "uig", "urd"):
1261  if not FONTS:
1262  FONTS = PERSIAN_FONTS
1263  elif lang in ("heb", "yid"):
1264  NUMBER_DAWG_FACTOR = 0.05
1265  WORD_DAWG_FACTOR = 0.08
1266  if not FONTS:
1267  FONTS = HEBREW_FONTS
1268  elif lang == "syr":
1269  if not FONTS:
1270  FONTS = SYRIAC_FONTS
1271 
1272  # Other scripts.
1273  elif lang in ("amh", "tir"):
1274  if not FONTS:
1275  FONTS = AMHARIC_FONTS
1276  elif lang == "chr":
1277  if not FONTS:
1278  FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
1279  elif lang == "ell":
1280  NUMBER_DAWG_FACTOR = 0.05
1281  WORD_DAWG_FACTOR = 0.08
1282  if not FONTS:
1283  FONTS = GREEK_FONTS
1284  elif lang == "grc":
1285  if not EXPOSURES:
1286  EXPOSURES = "-3 -2 -1 0 1 2 3".split()
1287  if not FONTS:
1288  FONTS = ANCIENT_GREEK_FONTS
1289  elif lang == "hye":
1290  if not FONTS:
1291  FONTS = ARMENIAN_FONTS
1292  elif lang == "iku":
1293  if not FONTS:
1294  FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
1295  elif lang == "kat":
1296  if not FONTS:
1297  FONTS = GEORGIAN_FONTS
1298  elif lang == "kat_old":
1299  TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
1300  if not FONTS:
1301  FONTS = OLD_GEORGIAN_FONTS
1302  elif lang == "kir":
1303  if not FONTS:
1304  FONTS = KYRGYZ_FONTS
1305  TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
1306  elif lang == "kmr":
1307  if not FONTS:
1308  FONTS = LATIN_FONTS
1309  elif lang == "kur_ara":
1310  if not FONTS:
1311  FONTS = KURDISH_FONTS
1312  else:
1313  raise ValueError(f"Error: {lang} is not a valid language code")
1314 
1315  FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
1316  if FLAGS_mean_count > 0:
1317  TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
1318  elif not MEAN_COUNT:
1319  TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]
1320 
1321  # Default to Latin fonts if none have been set
1322  if not FONTS:
1323  FONTS = LATIN_FONTS
1324 
1325  # Default to 0 exposure if it hasn't been set
1326  if not EXPOSURES:
1327  EXPOSURES = [0]
1328  # Set right-to-left and normalization mode.
1329  if lang in (
1330  "ara",
1331  "div",
1332  "fas",
1333  "pus",
1334  "snd",
1335  "syr",
1336  "uig",
1337  "urd",
1338  "kur_ara",
1339  "heb",
1340  "yid",
1341  ):
1342  LANG_IS_RTL = True
1343  NORM_MODE = 2
1344  elif lang in (
1345  "asm",
1346  "ben",
1347  "bih",
1348  "hin",
1349  "mar",
1350  "nep",
1351  "guj",
1352  "kan",
1353  "mal",
1354  "tam",
1355  "tel",
1356  "pan",
1357  "dzo",
1358  "sin",
1359  "san",
1360  "bod",
1361  "ori",
1362  "khm",
1363  "mya",
1364  "tha",
1365  "lao",
1366  "jav ",
1367  "jav_java",
1368  ):
1369  LANG_IS_RTL = False
1370  NORM_MODE = 2
1371  else:
1372  LANG_IS_RTL = False
1373  NORM_MODE = 1
1374 
1375  vars_to_transfer = {
1376  'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
1377  'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
1378  'exposures': EXPOSURES,
1379  'filter_arguments': FILTER_ARGUMENTS,
1380  'fonts': FONTS,
1381  'fragments_disabled': FRAGMENTS_DISABLED,
1382  'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
1383  'lang_is_rtl': LANG_IS_RTL,
1384  'leading': LEADING,
1385  'mean_count': MEAN_COUNT,
1386  'mix_lang': MIX_LANG,
1387  'norm_mode': NORM_MODE,
1388  'number_dawg_factor': NUMBER_DAWG_FACTOR,
1389  'punc_dawg_factor': PUNC_DAWG_FACTOR,
1390  'run_shape_clustering': RUN_SHAPE_CLUSTERING,
1391  'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
1392  'text_corpus': TEXT_CORPUS,
1393  'training_data_arguments': TRAINING_DATA_ARGUMENTS,
1394  'word_dawg_factor': WORD_DAWG_FACTOR,
1395  'word_dawg_size': WORD_DAWG_SIZE,
1396  'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
1397  }
1398 
1399  for attr, value in vars_to_transfer.items():
1400  if hasattr(ctx, attr):
1401  if getattr(ctx, attr) != value:
1402  log.debug(f"{attr} = {value} (was {getattr(ctx, attr)})")
1403  setattr(ctx, attr, value)
1404  else:
1405  log.debug(f"{attr} = {value} (set on cmdline)")
1406  else:
1407  log.debug(f"{attr} = {value}")
1408  setattr(ctx, attr, value)
1409 
1410  return ctx
1411 
1412 # =============================================================================
1413 # END of Language specific info
1414 # =============================================================================
language_specific.set_lang_specific_parameters
def set_lang_specific_parameters(ctx, lang)
Definition: language_specific.py:894
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154