Skip to content
Snippets Groups Projects

Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
Compare and Show latest version
7 files
+ 67
- 118394
Compare changes
  • Side-by-side
  • Inline
Files
7
@@ -355,6 +355,8 @@ class ArkindexExtractor:
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Add unknown token to charset
self.charset.add(self.unknown_token)
# Build LM tokens
for token in sorted(list(self.charset)):
@@ -365,14 +367,18 @@ class ArkindexExtractor:
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
self.language_tokens.append(self.unknown_token)
# Build LM corpus
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
Loading