diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 8f1d1e46fce5d77e53eb9733e7d952faebc618ce..9e0706929dfc4de96f20296f775aeb263c6e9617 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -14,6 +14,7 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
+from nltk.tokenize import wordpunct_tokenize

 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -35,6 +36,7 @@ from dan.datasets.extract.utils import (
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
+    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -97,9 +99,9 @@ class ArkindexExtractor:

         self.data: Dict = defaultdict(dict)
         self.charset = set()
-        self.language_corpus = []
+        self.language_corpus = defaultdict(list)
         self.language_tokens = []
-        self.language_lexicon = []
+        self.language_lexicon = defaultdict(list)

         # Image download tasks to process
         self.tasks: List[Dict[str, str]] = []
@@ -275,12 +277,6 @@ class ArkindexExtractor:
             )
         return text.strip()

-    def format_text_language_model(self, text: str):
-        """
-        Format text for the language model. Return the text tokenized at character-level.
-        """
-        return " ".join(map(self.mapping.encode_token, list(text.strip())))
-
     def process_element(
         self,
         element: Element,
@@ -319,14 +315,6 @@ class ArkindexExtractor:
         self.data[split][str(image_path)] = text
         self.charset = self.charset.union(set(text))

-        # Language model should be built using only text from the training set
-        if split == "train":
-            self.language_corpus.append(self.format_text_language_model(text))
-
-        # Language model should be built using only text from the training set
-        if split == "train":
-            self.language_corpus.append(self.format_text_language_model(text))
-
     def process_parent(
         self,
         pbar,
@@ -365,6 +353,9 @@ class ArkindexExtractor:
         """
         Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
         """
+        logger.info("Preparing language resources")
+
+        # Build LM tokens
         for token in sorted(list(self.charset)):
             assert (
                 token not in self.mapping.encode.values()
@@ -373,14 +364,27 @@ class ArkindexExtractor:
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)

-        # Add the special blank token
         self.language_tokens.append(self.mapping.ctc.encoded)
-
-        # Build lexicon
         assert all(
             [len(token) == 1 for token in self.language_lexicon]
         ), "Tokens should be single characters."
-        self.language_lexicon = [f"{token} {token}" for token in self.language_tokens]
+
+        # Build LM corpus
+        train_corpus = [text for text in self.data["train"].values()]
+        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
+        tokenizer.train_subword_tokenizer()
+        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
+        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
+        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+
+        # Build vocabulary
+        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
+        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+
+        # Build LM lexicon
+        self.language_lexicon["characters"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
+        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
+        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]

     def export(self):
         (self.output / "labels.json").write_text(
@@ -390,15 +394,16 @@ class ArkindexExtractor:
                 indent=4,
             )
         )
-        (self.output / "language_model" / "corpus.txt").write_text(
-            "\n".join(self.language_corpus)
-        )
+        for level in ["characters", "words", "subwords"]:
+            (self.output / "language_model" / f"corpus_{level}.txt").write_text(
+                "\n".join(self.language_corpus[level])
+            )
+            (self.output / "language_model" / f"lexicon_{level}.txt").write_text(
+                "\n".join(self.language_lexicon[level])
+            )
         (self.output / "language_model" / "tokens.txt").write_text(
             "\n".join(self.language_tokens)
         )
-        (self.output / "language_model" / "lexicon.txt").write_text(
-            "\n".join(self.language_lexicon)
-        )
         (self.output / "charset.pkl").write_bytes(
             pickle.dumps(sorted(list(self.charset)))
         )
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 9877b6cecb01ecf16e74ba3e391f9a1ca42cbdf9..7b92d773152f1f57336f89b487a04d9c91b6d815 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -12,8 +12,10 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-
+from pathlib import Path
 from dan.utils import EntityType
+import sentencepiece as spm
+from nltk import wordpunct_tokenize

 logger = logging.getLogger(__name__)

@@ -115,3 +117,76 @@ def get_bbox(polygon: List[List[int]]) -> str:
     x, y = min(all_x), min(all_y)
     width, height = max(all_x) - x, max(all_y) - y
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
+
+
+class Tokenizer():
+    """
+    A multi-level tokenizer (char, subword, word).
+    The subword tokenizer is trained using sentencepiece.
+ """ + def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None: + self.corpus = training_corpus + self.outdir = outdir + self.prefix = f"{self.outdir}/subword_tokenizer" + self.sentencepiece_model = None + self.mapping = mapping + self.tokens = tokens + + @property + def ner_tokens(self): + return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""] + + @property + def mapping_tokens(self): + return [token.encoded for token in self.mapping] + + @property + def special_tokens(self): + return list(set(self.ner_tokens + self.mapping_tokens)) + + def train_subword_tokenizer(self): + """ + Train a sentencepiece model on the training corpus. + """ + # Write the corpus in a text file + corpus_file = Path(self.outdir / f"tmp_training_corpus.txt") + corpus_file.write_text("\n".join(self.corpus)) + + # Train the tokenizer and load it + logger.info("Training sentencepiece model for subword tokenization") + spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens) + + # Delete the corpus file + corpus_file.unlink() + + # Load the corpus + self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model") + + def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]: + """ + Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways. + """ + tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size) + # Replace special sentencepiece space token + tokens = [t.replace("â–", "⎵") for t in tokens] + # Return encoded tokenized text + return " ".join(["".join(self.encode(subword)) for subword in tokens]) + + def word_tokenize(self, text: str) -> List[str]: + """ + Tokenize text into words + Spaces (⎵) and NER tokens are considered as distinct words. + """ + words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)] + words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)]) + return words + + def char_tokenize(self, text: str) -> List[str]: + """ + Tokenize text into characters + """ + return " ".join(self.encode(list(text))) + + def encode(self, text: List[str]) -> List[str]: + return map(self.mapping.encode_token, text) +