diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 9e0706929dfc4de96f20296f775aeb263c6e9617..2befb801b680b89328c99eb714f0e3733802f223 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize
 
 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:
 
         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]
 
         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )
 
         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]
 
     def export(self):
         (self.output / "labels.json").write_text(
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 7b92d773152f1f57336f89b487a04d9c91b6d815..a5f7c4683aaa2d98c1ab6b9c3f56563f2c7ce08b 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -2,9 +2,12 @@
 import logging
 import re
 from io import BytesIO
+from pathlib import Path
 from typing import List
 
 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -12,10 +15,8 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-from pathlib import Path
-from dan.utils import EntityType
-import sentencepiece as spm
-from nltk import wordpunct_tokenize
+
+from dan.utils import EntityType, LMTokenMapping
 
 logger = logging.getLogger(__name__)
 
@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
 
 
-class Tokenizer():
-    """
+class Tokenizer:
+    """
     A multi-level tokenizer (char, subword, word)
     Subword tokenizer is trained using sentencepiece.
     """
-    def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
+
+    def __init__(
+        self,
+        training_corpus: List[str],
+        outdir: Path,
+        mapping: LMTokenMapping,
+        tokens: EntityType = None,
+        subword_vocab_size: int = 1000,
+    ) -> None:
         self.corpus = training_corpus
         self.outdir = outdir
         self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.sentencepiece_model = None
-        self.mapping = mapping
         self.tokens = tokens
+        self.mapping = mapping
+        # Train the subword tokenizer
+        self.user_subword_vocab_size = subword_vocab_size
+        self.sentencepiece_model = self.train_subword_tokenizer()
 
     @property
-    def ner_tokens(self):
-        return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
+    def ner_tokens(self) -> List[str]:
+        if self.tokens is None:
+            return []
+        return [entity.start for entity in self.tokens.values()] + [
+            entity.end for entity in self.tokens.values() if entity.end != ""
+        ]
 
     @property
-    def mapping_tokens(self):
+    def mapping_tokens(self) -> List[str]:
         return [token.encoded for token in self.mapping]
-    
+
     @property
-    def special_tokens(self):
+    def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))
 
+    @property
+    def subword_vocab_size(self):
+        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
+        return min(self.user_subword_vocab_size, 3 * n_words)
+
     def train_subword_tokenizer(self):
-        """
+        """
         Train a sentencepiece model on the training corpus.
         """
         # Write the corpus in a text file
-        corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
+        corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
 
         # Train the tokenizer and load it
         logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
+        spm.SentencePieceTrainer.train(
+            input=str(corpus_file),
+            vocab_size=self.subword_vocab_size,
+            model_prefix=self.prefix,
+            user_defined_symbols=self.special_tokens,
+        )
 
         # Delete the corpus file
         corpus_file.unlink()
 
-        # Load the corpus
-        self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        # Load the model and return it
+        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
-        """
-        Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
+    def subword_tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
-        tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
-        # Replace special sentencepiece space token
-        tokens = [t.replace("▁", "⎵") for t in tokens]
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
     def word_tokenize(self, text: str) -> List[str]:
-        """
+        """
         Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as distinct words.
+        Spaces (⎵) and NER tokens are considered as words.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
+        words = " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
         return words
-    
+
     def char_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into characters
@@ -188,5 +218,7 @@ class Tokenizer():
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens
+        """
         return map(self.mapping.encode_token, text)
-