Skip to content
Snippets Groups Projects

Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
Compare and Show latest version
3 files
+56 -61
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -372,35 +372,29 @@ class ArkindexExtractor:
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
train_corpus,
training_corpus=train_corpus,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
self.language_corpus["characters"] = [
tokenizer.char_tokenize(doc) for doc in train_corpus
]
self.language_corpus["words"] = [
tokenizer.word_tokenize(doc) for doc in train_corpus
]
self.language_corpus["subwords"] = [
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in get_vocabulary(self.language_corpus["words"])
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in get_vocabulary(self.language_corpus["subwords"])
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
def export(self):
(self.output / "labels.json").write_text(
Loading