diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 6b4b006eb3d5b917233049da75eb1a026e04a663..7478f0190751817fefec72943eb9a00ebd9c4007 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
     Tokenizer,
     download_image,
     get_bbox,
+    get_vocabulary,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
             self.language_tokens.append(
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
-
         self.language_tokens.append(self.mapping.ctc.encoded)
-        assert all(
-            [len(token) == 1 for token in self.language_lexicon]
-        ), "Tokens should be single characters."
 
         # Build LM corpus
-        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
+        train_corpus = [
+            text.replace(self.mapping.linebreak.display, self.mapping.space.display)
+            for text in self.data["train"].values()
+        ]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
@@ -388,36 +388,18 @@ class ArkindexExtractor:
             tokenizer.subword_tokenize(doc) for doc in train_corpus
         ]
 
-        # Build vocabulary
-        word_vocabulary = set(
-            [
-                word
-                for doc in self.language_corpus["words"]
-                for word in doc.split()
-                if word != ""
-            ]
-        )
-        subword_vocabulary = set(
-            [
-                subword
-                for doc in self.language_corpus["subwords"]
-                for subword in doc.split()
-                if subword != ""
-            ]
-        )
-
         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
         self.language_lexicon["words"] = [
             f"{word} {tokenizer.char_tokenize(word)}"
-            for word in sorted(word_vocabulary)
+            for word in get_vocabulary(self.language_corpus["words"])
             if word != ""
         ]
         self.language_lexicon["subwords"] = [
             f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in sorted(subword_vocabulary)
+            for subword in get_vocabulary(self.language_corpus["subwords"])
         ]
 
     def export(self):
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 9ed48b583bce63590f89930fd9f3cfc33c9b9baa..092257ee10f56ad38920f6bea1d39b0b2c080117 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
 
 
+def get_vocabulary(tokenized_text: List[str]) -> List[str]:
+    """
+    Compute the sorted vocabulary from tokenized text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(
+        set([token for doc in tokenized_text for token in doc.split() if token != ""])
+    )
+
+
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
     :param training_corpus: List of training text.
     :param outdir: Path to save the subword tokenizer.
-    :param mapping: Mapping between displayed and encoded versions of special characters
+    :param mapping: Mapping between displayed and encoded versions of special characters.
     :param tokens: Start and end tokens used to represent named entities.
     :param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
     """
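
For illustration only, not part of the patch: a minimal, standalone sketch of the behaviour the new get_vocabulary helper is expected to have, namely building a sorted, deduplicated vocabulary from whitespace-tokenized documents while dropping empty tokens. The function body is copied locally so the snippet runs without installing dan; the sample corpus values are made up.

from typing import List


def get_vocabulary(tokenized_text: List[str]) -> List[str]:
    """Return the sorted set of unique, non-empty tokens found in the tokenized corpus."""
    return sorted(
        set(token for doc in tokenized_text for token in doc.split() if token != "")
    )


if __name__ == "__main__":
    # Whitespace-tokenized documents, e.g. the entries of self.language_corpus["words"].
    corpus = ["the cat sat", "the dog ran", ""]
    print(get_vocabulary(corpus))  # ['cat', 'dog', 'ran', 'sat', 'the']

Returning a sorted list rather than a raw set keeps the generated lexicon files deterministic across runs.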