Skip to content
Snippets Groups Projects

Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
2 files
+ 18
− 26
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
insert_token,
normalize_linebreaks,
normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
# Build LM corpus
train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
train_corpus,
outdir=self.output / "language_model",
@@ -388,36 +388,18 @@ class ArkindexExtractor:
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary
word_vocabulary = set(
[
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
for word in get_vocabulary(self.language_corpus["words"])
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
for subword in get_vocabulary(self.language_corpus["subwords"])
]
def export(self):
Loading