Commit 26cb20bf authored by Solene Tarride

Prepare language files for word and subword LM

parent 21f27601
This commit is part of merge request !287.
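As a rough illustration of the change (not part of the diff): the training corpus is now written at three granularities instead of characters only. The space symbol ⎵ is assumed to come from LMTokenMapping, and the subword split below is hypothetical since it depends on the trained sentencepiece model.

line = "le chat"
characters = "l e ⎵ c h a t"   # one line of corpus_characters.txt
words = "le ⎵ chat"            # one line of corpus_words.txt
subwords = "⎵le ⎵ch at"        # one line of corpus_subwords.txt (hypothetical split)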
@@ -14,6 +14,7 @@ import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm
from nltk.tokenize import wordpunct_tokenize
from arkindex_export import open_database
from dan.datasets.extract.db import (
@@ -35,6 +36,7 @@ from dan.datasets.extract.utils import (
insert_token,
normalize_linebreaks,
normalize_spaces,
Tokenizer,
)
from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
@@ -97,9 +99,9 @@ class ArkindexExtractor:
self.data: Dict = defaultdict(dict)
self.charset = set()
self.language_corpus = []
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = []
self.language_lexicon = defaultdict(list)
# Image download tasks to process
self.tasks: List[Dict[str, str]] = []
@@ -275,12 +277,6 @@
)
return text.strip()
def format_text_language_model(self, text: str):
"""
Format text for the language model. Return the text tokenized at character-level.
"""
return " ".join(map(self.mapping.encode_token, list(text.strip())))
def process_element(
self,
element: Element,
@@ -319,14 +315,6 @@
self.data[split][str(image_path)] = text
self.charset = self.charset.union(set(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
def process_parent(
self,
pbar,
@@ -365,6 +353,9 @@
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
@@ -373,14 +364,27 @@
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
# Add the special blank token
self.language_tokens.append(self.mapping.ctc.encoded)
# Build lexicon
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
self.language_lexicon = [f"{token} {token}" for token in self.language_tokens]
# Build LM corpus
train_corpus = [text for text in self.data["train"].values()]
tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
tokenizer.train_subword_tokenizer()
self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
# Build vocabulary
word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
# Build LM lexicon
self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
def export(self):
(self.output / "labels.json").write_text(
@@ -390,15 +394,16 @@
indent=4,
)
)
(self.output / "language_model" / "corpus.txt").write_text(
"\n".join(self.language_corpus)
)
for level in ["characters", "words", "subwords"]:
(self.output / "language_model" / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / "language_model" / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / "language_model" / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "language_model" / "lexicon.txt").write_text(
"\n".join(self.language_lexicon)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
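With these changes, the extraction output gains a language_model/ directory holding one corpus and one lexicon file per level, plus tokens.txt and the trained sentencepiece model (subword_tokenizer.model/.vocab, from the Tokenizer's model_prefix). A minimal sketch to inspect the result; the output path is hypothetical:

from pathlib import Path

lm_dir = Path("output") / "language_model"  # hypothetical extraction output directory
for level in ["characters", "words", "subwords"]:
    corpus = (lm_dir / f"corpus_{level}.txt").read_text().splitlines()
    lexicon = (lm_dir / f"lexicon_{level}.txt").read_text().splitlines()
    print(f"{level}: {len(corpus)} corpus lines, {len(lexicon)} lexicon entries")
print(f"{len((lm_dir / 'tokens.txt').read_text().splitlines())} LM tokens")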
dan/datasets/extract/utils.py
@@ -12,8 +12,10 @@ from tenacity import (
stop_after_attempt,
wait_exponential,
)
from pathlib import Path
from dan.utils import EntityType
import sentencepiece as spm
from nltk import wordpunct_tokenize
logger = logging.getLogger(__name__)
@@ -107,3 +109,76 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
class Tokenizer():
"""
A multi-level tokenizer (char, subword, word)
The subword tokenizer is trained using sentencepiece.
"""
def __init__(self, training_corpus, outdir, mapping, tokens={}) -> None:
self.corpus = training_corpus
self.outdir = outdir
self.prefix = f"{self.outdir}/subword_tokenizer"
self.sentencepiece_model = None
self.mapping = mapping
self.tokens = tokens
@property
def ner_tokens(self):
return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
@property
def mapping_tokens(self):
return [token.encoded for token in self.mapping]
@property
def special_tokens(self):
return list(set(self.ner_tokens + self.mapping_tokens))
def train_subword_tokenizer(self):
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
corpus_file = self.outdir / "tmp_training_corpus.txt"
corpus_file.write_text("\n".join(self.corpus))
# Train the tokenizer and load it
logger.info("Training sentencepiece model for subword tokenization")
spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
# Delete the corpus file
corpus_file.unlink()
# Load the trained sentencepiece model
self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> str:
"""
Tokenize text into subwords. Since sampling is enabled, the same text can be tokenized differently on each call.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
# Replace the sentencepiece space marker (▁) with the mapping's space token
tokens = [t.replace("▁", "⎵") for t in tokens]
# Return the encoded tokenized text
return " ".join(["".join(self.encode(subword)) for subword in tokens])
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into words.
Spaces (⎵) and NER tokens are considered as distinct words.
"""
words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
return words
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into characters.
"""
return " ".join(self.encode(list(text)))
def encode(self, text: List[str]) -> List[str]:
return list(map(self.mapping.encode_token, text))
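A minimal usage sketch of the Tokenizer above (not part of the commit). It assumes LMTokenMapping from dan.utils can be instantiated without arguments and provides encode_token, as used by ArkindexExtractor, and that the corpus is large enough for the hard-coded vocab_size of 1000:

from pathlib import Path
from dan.utils import LMTokenMapping

# Hypothetical corpus: in practice this is the list of training-set transcriptions;
# the tiny list here is only illustrative, since sentencepiece needs enough distinct
# pieces to reach vocab_size=1000.
corpus = ["a few training transcriptions", "from the training split"]
outdir = Path("language_model")
outdir.mkdir(parents=True, exist_ok=True)

tokenizer = Tokenizer(corpus, outdir=outdir, mapping=LMTokenMapping(), tokens={})
tokenizer.train_subword_tokenizer()  # writes language_model/subword_tokenizer.model

line = corpus[0]
print(tokenizer.char_tokenize(line))     # space-separated encoded characters
print(tokenizer.word_tokenize(line))     # words separated by the encoded space token
print(tokenizer.subword_tokenize(line))  # sentencepiece subwords (sampling enabled)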