import argparse
import atexit
import concurrent.futures
import logging
import os
import pathlib
import platform
import shutil
import subprocess
import sys
from datetime import date
from operator import itemgetter
from tempfile import TemporaryDirectory, mkdtemp

from tqdm import tqdm

from language_specific import VERTICAL_FONTS

log = logging.getLogger(__name__)
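
# Example invocation (script name and paths are illustrative):
#
#   python tesstrain.py --lang eng --fonts_dir /usr/share/fonts \
#       --langdata_dir ../langdata_lstm --tessdata_dir ../tessdata \
#       --output_dir ../tesstrain_output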

class TrainingArguments(argparse.Namespace):
    def __init__(self):
        super().__init__()
        self.uname = platform.uname().system.lower()
        # ... (remaining defaults elided in this excerpt)
        self.fonts_dir = (
            "/Library/Fonts/" if "darwin" in self.uname else "/usr/share/fonts/"
        )

    def __eq__(self, other):
        return (
            argparse.Namespace.__eq__(self, other)
            and self.uname == other.uname
            and self.lang_code == other.lang_code
            # ... (remaining attribute comparisons elided)
        )
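
# err_exit is defined in a portion of the file not included here; a minimal
# equivalent (assumed) logs the message and terminates the run:
def err_exit(msg):
    log.critical(msg)
    sys.exit(1)


# run_command (below) looks for a training binary under the bare name and under the
# "api/" and "training/" prefixes, runs it with the given arguments, logs its
# combined stdout/stderr, and aborts the whole run on a non-zero exit code.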

def run_command(cmd, *args, env=None):
    for d in ("", "api/", "training/"):
        testcmd = shutil.which(f"{d}{cmd}")
        if testcmd:
            cmd = testcmd
            break
    if not shutil.which(cmd):
        err_exit(f"{cmd} not found")

    log.debug(f"Running {cmd}")
    args = list(args)
    for idx, arg in enumerate(args):
        # Convert WindowsPath arguments to plain strings to avoid subprocess
        # issues on Windows.
        if isinstance(arg, pathlib.WindowsPath):
            args[idx] = str(arg)

    proc = subprocess.run(
        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    proclog = logging.getLogger(cmd)
    if proc.returncode == 0:
        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
    else:
        try:
            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
        except Exception as e:
            proclog.error(e)
        err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")

def check_file_readable(*filenames):
    if isinstance(filenames, (str, pathlib.Path)):
        filenames = [filenames]
    for filename in filenames:
        try:
            with pathlib.Path(filename).open():
                pass
        except FileNotFoundError:
            err_exit(f"Required/expected file '{filename}' does not exist")
        except PermissionError:
            err_exit(f"{filename} is not readable")
        except IOError as e:
            err_exit(f"{filename} IO Error: {str(e)}")

parser = argparse.ArgumentParser(
    epilog="""
    The font names specified in --fontlist need to be recognizable by Pango using
    fontconfig. An easy way to list the canonical names of all fonts available on
    your system is to run text2image with --list_available_fonts and the
    appropriate --fonts_dir path.
    """
)
parser.add_argument(
    "--fontlist",
    dest="fonts",
    nargs="+",
    help="A list of fontnames to train on.",
)
parser.add_argument("--fonts_dir", help="Path to font files.")
parser.add_argument("--tmp_dir", help="Path to temporary training directory.")
parser.add_argument(
    "--lang", metavar="LANG_CODE", dest="lang_code", help="ISO 639 code."
)
parser.add_argument(
    "--langdata_dir",
    help="Path to tesseract/training/langdata directory.",
)
parser.add_argument("--maxpages", type=int, dest="max_pages")
parser.add_argument(
    "--output_dir", metavar="OUTPUTDIR", help="Location of output traineddata file."
)
parser.add_argument(
    "--overwrite", action="store_true", help="Safe to overwrite files in output_dir."
)
parser.add_argument(
    "--save_box_tiff",
    action="store_true",
    help="Save box/tiff pairs along with lstmf files.",
)
parser.add_argument(
    "--linedata_only",
    action="store_true",
    help="Only generate training data for lstmtraining.",
)

inputdata_group = parser.add_argument_group(
    "inputdata",
    "OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.",
)
inputdata_group.add_argument(
    "--training_text", metavar="TEXTFILE", help="Text to render and use for training."
)
inputdata_group.add_argument(
    "--wordlist",
    dest="wordlist_file",
    help="Word list for the language ordered by decreasing frequency.",
)

parser.add_argument("--extract_font_properties", action="store_true")
parser.add_argument(
    "--noextract_font_properties", dest="extract_font_properties", action="store_false"
)
parser.add_argument(
    "--distort_image", dest="distort_image", action="store_true"
)

tessdata_group = parser.add_argument_group(
    "tessdata",
    "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.",
)
tessdata_group.add_argument(
    "--tessdata_dir",
    metavar="TESSDATADIR",
    help="Path to tesseract/tessdata directory.",
)
parser.add_argument(
    "--exposures",
    # ... (other options elided in this excerpt)
    help="A list of exposure levels to use (e.g. -1,0,1).",
)


def parse_flags(argv=None):
    ctx = TrainingArguments()
    parser.parse_args(args=argv, namespace=ctx)
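
    # Everything below validates the parsed flags and fills in defaults: a language
    # and a langdata_dir are mandatory, tessdata_dir falls back to $TESSDATA_PREFIX,
    # and the output and intermediate training directories default to freshly
    # created temporary directories.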
    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")
    if not ctx.tessdata_dir:
        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
        if not tessdata_prefix:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )
        else:
            ctx.tessdata_dir = tessdata_prefix
    if not ctx.output_dir:
        ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Location where intermediate files will be created.
    if not ctx.tmp_dir:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}")
    else:
        ctx.training_dir = mkdtemp(
            prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir
        )
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # Report the temporary directory if it still exists when the program exits.
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    atexit.register(show_tmpdir_location, ctx.training_dir)
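
    # Input files that were not given on the command line are looked up in the
    # per-language subdirectory of --langdata_dir, following the usual
    # <lang>.training_text / <lang>.wordlist naming convention.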
    if not ctx.training_text:
        ctx.training_text = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
        )
    if not ctx.wordlist_file:
        ctx.wordlist_file = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
        )

    ctx.word_bigrams_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
    )
    ctx.numbers_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
    )
    ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
    ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.bigram_freqs"
    )
    ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.unigram_freqs"
    )
    ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.train_ngrams"
    )
    ctx.generate_dawgs = 1
    return ctx
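
# cleanup (below) copies the run log into the output directory and then deletes the
# temporary training directory.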

def cleanup(ctx):
    shutil.copy(ctx.log_file, ctx.output_dir)
    shutil.rmtree(ctx.training_dir)
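
# initialize_fontconfig (below) primes the per-run fontconfig cache directory by
# rendering a small sample text with text2image using the first requested font.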

def initialize_fontconfig(ctx):
    sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
    pathlib.Path(sample_path).write_text("Text\n")
    log.info(f"Testing font: {ctx.fonts[0]}")
    run_command(
        "text2image",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--font={ctx.fonts[0]}",
        f"--outputbase={sample_path}",
        f"--text={sample_path}",
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
    )

def make_fontname(font):
    # e.g. "Arial Bold, Italic" -> "Arial_Bold_Italic"
    return font.replace(" ", "_").replace(",", "")


def make_outbase(ctx, fontname, exposure):
    return pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"

def generate_font_image(ctx, font, exposure, char_spacing):
    log.info(f"Rendering using {font}")
    fontname = make_fontname(font)
    outbase = make_outbase(ctx, fontname, exposure)

    common_args = [
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--strip_unrenderable_words",
        f"--leading={ctx.leading}",
        f"--char_spacing={char_spacing}",
        f"--exposure={exposure}",
        f"--outputbase={outbase}",
        f"--max_pages={ctx.max_pages}",
    ]

    if ctx.distort_image:
        common_args.append("--distort_image")

    # Render vertical-script fonts in upright vertical writing mode.
    if font in VERTICAL_FONTS:
        common_args.append("--writing_mode=vertical-upright")

    run_command(
        "text2image",
        *common_args,
        f"--font={font}",
        f"--text={ctx.training_text}",
        *ctx.text2image_extra_args,
    )

    if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
        log.info(f"Extracting font properties of {font}")
        run_command(
            "text2image",
            *common_args,
            f"--font={font}",
            f"--ligatures=false",
            f"--text={ctx.train_ngrams_file}",
            f"--only_extract_font_properties",
        )
    return f"{font}-{exposure}"

def phase_I_generate_image(ctx, par_factor=None):
    if not par_factor or par_factor <= 0:
        par_factor = 1

    log.info("=== Phase I: Generating training images ===")
    char_spacing = 0.0  # assumed default; defined in lines elided from this excerpt

    for exposure in ctx.exposures:
        if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
            # Build a .train_ngrams file from the most frequent bigrams of the
            # training text.
            lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
            # Materialize the records as a list: they are iterated once for the
            # total count and again when writing the ngrams file.
            records = [line.split() for line in lines]
            p = 0.99  # fraction of the total bigram mass to keep (value assumed)
            ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
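
            # ngram_frac is a cumulative-count threshold: bigrams are written out in
            # decreasing frequency order until their combined count exceeds the
            # fraction p of all bigram occurrences.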
            with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                cumsum = 0
                for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                    if cumsum > ngram_frac:
                        break
                    f.write(bigram + " ")
                    cumsum += int(count)

        with tqdm(
            total=len(ctx.fonts)
        ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor:
            futures = [
                executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                for font in ctx.fonts
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    err_exit("Failed while generating images " + str(exc))
                else:
                    pbar.update(1)

        for font in ctx.fonts:
            ...  # body elided in this excerpt
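
# Phase UP (below) builds the unicharset from the generated .box files and then runs
# set_unicharset_properties to add script metadata and x-height information.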

def phase_UP_generate_unicharset(ctx):
    log.info("=== Phase UP: Generating unicharset and unichar properties files ===")

    box_files = pathlib.Path(ctx.training_dir).glob("*.box")

    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
    run_command(
        "unicharset_extractor",
        "--output_unicharset",
        f"{ctx.unicharset_file}",
        # ... (additional flags elided in this excerpt)
        *box_files,
    )

    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
    run_command(
        "set_unicharset_properties",
        "-U",
        f"{ctx.unicharset_file}",
        "-O",
        f"{ctx.unicharset_file}",
        "-X",
        f"{ctx.xheights_file}",
        f"--script_dir={ctx.langdata_dir}",
    )

def phase_E_extract_features(ctx, box_config, ext):
    log.info(f"=== Phase E: Generating {ext} files ===")

    img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))

    # Use a language-specific config file if the langdata directory provides one.
    config = ""
    testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
    if testconfig.exists():
        config = testconfig
        log.info(f"Using {ctx.lang_code}.config")

    tessdata_environ = os.environ.copy()
    tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)
    log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")

    with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
        # max_workers argument elided in this excerpt
    ) as executor:
        futures = []
        for img_file in img_files:
            future = executor.submit(
                run_command,
                "tesseract",
                img_file,
                pathlib.Path(img_file).with_suffix(""),
                *box_config,
                config,
                env=tessdata_environ,
            )
            futures.append(future)

        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                err_exit("Failed while extracting features: " + str(exc))
            else:
                pbar.update(1)

    for img_file in img_files:
        ...  # body elided in this excerpt
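
# make_lstmdata (below) assembles the final training inputs: combine_lang_model
# builds a starter traineddata from the unicharset plus the language's wordlist,
# numbers, and punctuation patterns, and the generated .lstmf files are moved into
# the output directory together with a list file enumerating them.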

def make_lstmdata(ctx):
    log.info("=== Constructing LSTM training data ===")
    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
    path_output = pathlib.Path(ctx.output_dir)
    if not path_output.is_dir():
        log.info(f"Creating new directory {ctx.output_dir}")
        path_output.mkdir(exist_ok=True, parents=True)
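
    # Optional flags forwarded to combine_lang_model below: --lang_is_rtl for
    # right-to-left scripts, and --pass_through_recoder when the unicharset was
    # built with norm_mode >= 2.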
    args = []
    if ctx.lang_is_rtl:
        args.append("--lang_is_rtl")
    if ctx.norm_mode >= 2:
        args.append("--pass_through_recoder")

    run_command(
        "combine_lang_model",
        "--input_unicharset",
        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
        "--script_dir",
        f"{ctx.langdata_dir}",
        "--words",
        f"{lang_prefix}.wordlist",
        "--numbers",
        f"{lang_prefix}.numbers",
        "--puncs",
        f"{lang_prefix}.punc",
        # ... (remaining flags elided in this excerpt)
        *args,
    )

    def get_file_list():
        training_path = pathlib.Path(ctx.training_dir)
        if ctx.save_box_tiff:
            log.info("=== Saving box/tiff pairs for training data ===")
            yield from training_path.glob(f"{ctx.lang_code}*.box")
            yield from training_path.glob(f"{ctx.lang_code}*.tif")
        log.info("=== Moving lstmf files for training data ===")
        yield from training_path.glob(f"{ctx.lang_code}.*.lstmf")

    for f in get_file_list():
        log.debug(f"Moving {f} to {path_output / f.name}")
        shutil.move(str(f), path_output / f.name)

    lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
    dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
    pathlib.Path(lstm_list).write_text("\n".join(dir_listing))