Commit 8269a94d authored by Solene Tarride, committed by Solene Tarride

Simplify code

parent e972ac44
This commit is part of merge request !287.
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
     Tokenizer,
     download_image,
     get_bbox,
+    get_vocabulary,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
             self.language_tokens.append(
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
         self.language_tokens.append(self.mapping.ctc.encoded)
-        assert all(
-            [len(token) == 1 for token in self.language_lexicon]
-        ), "Tokens should be single characters."
         # Build LM corpus
-        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
+        train_corpus = [
+            text.replace(self.mapping.linebreak.display, self.mapping.space.display)
+            for text in self.data["train"].values()
+        ]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
@@ -388,36 +388,18 @@
             tokenizer.subword_tokenize(doc) for doc in train_corpus
         ]
-        # Build vocabulary
-        word_vocabulary = set(
-            [
-                word
-                for doc in self.language_corpus["words"]
-                for word in doc.split()
-                if word != ""
-            ]
-        )
-        subword_vocabulary = set(
-            [
-                subword
-                for doc in self.language_corpus["subwords"]
-                for subword in doc.split()
-                if subword != ""
-            ]
-        )
         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
         self.language_lexicon["words"] = [
             f"{word} {tokenizer.char_tokenize(word)}"
-            for word in sorted(word_vocabulary)
+            for word in get_vocabulary(self.language_corpus["words"])
             if word != ""
         ]
         self.language_lexicon["subwords"] = [
             f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in sorted(subword_vocabulary)
+            for subword in get_vocabulary(self.language_corpus["subwords"])
         ]

     def export(self):
......
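For context, each lexicon line built above pairs a vocabulary entry with its character-level spelling, the format typically expected by lexicon-based CTC decoders. Below is a minimal sketch of that line format, using a hypothetical char_tokenize_stub as a simplification of the project's Tokenizer.char_tokenize:

from typing import List


def char_tokenize_stub(text: str) -> str:
    # Hypothetical stand-in for Tokenizer.char_tokenize: split an entry into
    # characters separated by spaces.
    return " ".join(text)


def build_lexicon(vocabulary: List[str]) -> List[str]:
    # One line per entry: "<entry> <c h a r s>", mirroring
    # f"{word} {tokenizer.char_tokenize(word)}" in the diff above.
    return [f"{entry} {char_tokenize_stub(entry)}" for entry in vocabulary]


print(build_lexicon(["cat", "sat"]))
# ['cat c a t', 'sat s a t']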
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


+def get_vocabulary(tokenized_text: List[str]) -> List[str]:
+    """
+    Compute the vocabulary from tokenized text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(
+        set([token for doc in tokenized_text for token in doc.split() if token != ""])
+    )
+
+
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
     :param training_corpus: List of training text.
     :param outdir: Path to save the subword tokenizer.
-    :param mapping: Mapping between displayed and encoded versions of special characters
+    :param mapping: Mapping between displayed and encoded versions of special characters.
     :param tokens: Start and end tokens used to represent named entities.
     :param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
     """
......
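The Tokenizer docstring above describes three tokenization levels. The following is a rough, assumed illustration of what those levels produce for a plain sentence; the SentencePiece part presumes a subword model has already been trained on the corpus (for example via sentencepiece.SentencePieceTrainer.train), and the model filename is hypothetical:

import sentencepiece as spm

sentence = "the cat sat"

# Character level: one token per character.
chars = " ".join(sentence)  # "t h e   c a t   s a t"

# Word level: whitespace split.
words = sentence.split()  # ["the", "cat", "sat"]

# Subword level: learned segmentation with a fixed vocabulary size.
sp = spm.SentencePieceProcessor(model_file="subword_tokenizer.model")
subwords = sp.encode(sentence, out_type=str)  # e.g. ["▁the", "▁cat", "▁sat"]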