Skip to content
Snippets Groups Projects

Do not decompose special tokens, remove useless pbar management

Merged Yoann Schneider requested to merge do-not-decompose-special-tokens into master
4 files
+ 31
- 23
Compare changes
  • Side-by-side
  • Inline
Files
4
@@ -10,7 +10,7 @@ from atr_data_generator.extract.pylaia import (
TOKEN_NAME,
UNK_CHAR,
)
-from atr_data_generator.extract.pylaia.utils import _merge
+from atr_data_generator.extract.pylaia.utils import _merge, _string_decompose
class Syms:
@@ -60,7 +60,9 @@ class Syms:
# * character-based language model: "c c"
# Note: special tokens like <ctc>, <space> or <unk> must NOT be decomposed into characters (e.g. <ctc> must not become "< c t c >")
(output / LEXICON_NAME).write_text(
-"\n".join(_merge(extended_symbols, list(map(" ".join, extended_symbols))))
+"\n".join(
+_merge(extended_symbols, list(map(_string_decompose, extended_symbols)))
+)
)
# Character occurrences.
Loading