Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
2 files  + 106  − 42
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize
 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:
         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]

         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )

         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]

     def export(self):
         (self.output / "labels.json").write_text(
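
For context, the new block builds three parallel artifacts from the training transcriptions: a tokenized corpus per granularity (characters, words, subwords), a vocabulary of unique tokens, and a lexicon mapping each token to its character spelling, the format typically consumed by lexicon-based CTC decoders. The Tokenizer class itself is not part of this diff, so the sketch below uses simplified, hypothetical stand-ins for its three tokenize methods (the real subword method presumably relies on a trained subword model, as the removed train_subword_tokenizer() call suggests):

# Hypothetical stand-ins for the MR's Tokenizer methods (not shown in this diff).

def char_tokenize(text: str) -> str:
    # One token per character; the space itself becomes a visible token
    # ("▁" here) so the LM can model word boundaries.
    return " ".join("▁" if c == " " else c for c in text)

def word_tokenize(text: str) -> str:
    # Simplest stand-in: whitespace split.
    return " ".join(text.split())

def subword_tokenize(text: str) -> str:
    # Crude stand-in: fixed 3-character chunks. A real subword model
    # (BPE/unigram) would learn its segmentation from the training corpus.
    return " ".join(
        word[i : i + 3] for word in text.split() for i in range(0, len(word), 3)
    )

train_corpus = ["the cat sat", "the dog sat"]

# Corpus: one tokenized line per training document, per granularity.
language_corpus = {
    "characters": [char_tokenize(doc) for doc in train_corpus],
    "words": [word_tokenize(doc) for doc in train_corpus],
    "subwords": [subword_tokenize(doc) for doc in train_corpus],
}

# Vocabulary: unique non-empty tokens, as in the new set(...) comprehensions.
word_vocabulary = {w for doc in language_corpus["words"] for w in doc.split() if w}

# Lexicon: "<token> <its character spelling>", one entry per token.
language_lexicon = [f"{w} {char_tokenize(w)}" for w in sorted(word_vocabulary)]

print(language_corpus["characters"][0])  # t h e ▁ c a t ▁ s a t
print(language_lexicon)  # ['cat c a t', 'dog d o g', 'sat s a t', 'the t h e']

On a real corpus the subword segmentation would be learned rather than chunked, but the corpus → vocabulary → lexicon flow is the same as in the merged code, and sorting the vocabulary (as the new code does) keeps the emitted lexicon files deterministic across runs.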