Commit beee846d authored by Solene Tarride

Fix linting

parent 46d044cc
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize

 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:

         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]

         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )

         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]

     def export(self):
         (self.output / "labels.json").write_text(
......
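For context on the hunk above: a minimal, illustrative sketch (not part of the diff) of the line formats that the "Build LM corpus" and "Build LM lexicon" blocks produce. The token strings are toy values standing in for real Tokenizer output, and ⎵ is assumed to be the encoded space symbol, as in the Tokenizer docstrings further down.

# Illustrative only, not part of the diff: the line formats built by the
# corpus/lexicon block above, with toy strings in place of real Tokenizer
# output ("⎵" stands for an encoded space).

# language_corpus: one space-separated token sequence per training document.
language_corpus = {
    "characters": "T h e ⎵ c a t",  # char_tokenize(doc)
    "words": "The ⎵ cat",           # word_tokenize(doc)
    "subwords": "Th e ⎵ ca t",      # subword_tokenize(doc); split depends on the trained model
}

# language_lexicon: one "<entry> <character-level split of the entry>" line
# per vocabulary item, e.g. for the "words" lexicon built above.
word_vocabulary = {"The", "cat", "⎵"}
words_lexicon = [f"{word} {' '.join(word)}" for word in sorted(word_vocabulary)]
print("\n".join(words_lexicon))
# The T h e
# cat c a t
# ⎵ ⎵

The real code uses tokenizer.char_tokenize(word) rather than a plain character join, so that special tokens are encoded consistently with the corpus.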
@@ -2,9 +2,12 @@
 import logging
 import re
 from io import BytesIO
+from pathlib import Path
 from typing import List

 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -12,10 +15,8 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-from pathlib import Path
-from dan.utils import EntityType
-import sentencepiece as spm
-from nltk import wordpunct_tokenize
+
+from dan.utils import EntityType, LMTokenMapping

 logger = logging.getLogger(__name__)
@@ -111,68 +112,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


-class Tokenizer():
-    """
+class Tokenizer:
+    """
     A multi-level tokenizer (char, subword, word)
     Subword tokenizer is trained using sentencepiece.
     """
-    def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
+
+    def __init__(
+        self,
+        training_corpus: List[str],
+        outdir: Path,
+        mapping: LMTokenMapping,
+        tokens: EntityType = None,
+        subword_vocab_size: int = 1000,
+    ) -> None:
         self.corpus = training_corpus
         self.outdir = outdir
         self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.sentencepiece_model = None
-        self.mapping = mapping
         self.tokens = tokens
+        self.mapping = mapping
+
+        # Train the subword tokenizer
+        self.user_subword_vocab_size = subword_vocab_size
+        self.sentencepiece_model = self.train_subword_tokenizer()

     @property
-    def ner_tokens(self):
-        return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
+    def ner_tokens(self) -> List[str]:
+        if self.tokens is None:
+            return []
+        return [entity.start for entity in self.tokens.values()] + [
+            entity.end for entity in self.tokens.values() if entity.end != ""
+        ]

     @property
-    def mapping_tokens(self):
+    def mapping_tokens(self) -> List[str]:
         return [token.encoded for token in self.mapping]

     @property
-    def special_tokens(self):
+    def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))

+    @property
+    def subword_vocab_size(self):
+        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
+        return min(self.user_subword_vocab_size, 3 * n_words)
+
     def train_subword_tokenizer(self):
-        """
+        """
         Train a sentencepiece model on the training corpus.
         """
         # Write the corpus in a text file
-        corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
+        corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))

         # Train the tokenizer and load it
         logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
+        spm.SentencePieceTrainer.train(
+            input=str(corpus_file),
+            vocab_size=self.subword_vocab_size,
+            model_prefix=self.prefix,
+            user_defined_symbols=self.special_tokens,
+        )

         # Delete the corpus file
         corpus_file.unlink()

-        # Load the corpus
-        self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        # Load the model and return it
+        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")

-    def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
-        """
-        Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
+    def subword_tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
-        tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
-        # Replace special sentencepiece space token
-        tokens = [t.replace("▁", "⎵") for t in tokens]
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])

     def word_tokenize(self, text: str) -> List[str]:
-        """
+        """
         Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as distinct words.
+        Spaces (⎵) and NER tokens are considered as words.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
+        words = " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
         return words

     def char_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into characters
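Before the final hunk, a short standalone sketch (not from the repository) of the subword_vocab_size cap introduced above, to make the arithmetic concrete: the sentencepiece vocabulary is limited to three times the number of distinct whitespace-separated words in the training corpus, so small corpora no longer request the full default of 1000 pieces.

# Standalone illustration of the subword_vocab_size property above.
corpus = ["the cat sat", "the dog sat"]  # toy training corpus
user_subword_vocab_size = 1000           # default from __init__

n_words = len(set(word for doc in corpus for word in doc.split()))
vocab_size = min(user_subword_vocab_size, 3 * n_words)
print(n_words, vocab_size)  # 4 12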
@@ -180,5 +210,7 @@ class Tokenizer():
         return " ".join(self.encode(list(text)))

     def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens
+        """
         return map(self.mapping.encode_token, text)
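To close, a hedged end-to-end usage sketch of the reworked Tokenizer (training now happens in __init__ and subword tokenization is deterministic). This is not from the repository: it assumes the dan package is importable, that LMTokenMapping() can be constructed with its defaults, and that the toy corpus is large enough for sentencepiece to train the capped vocabulary.

# Hedged usage sketch, not from the repository. Assumes the dan package is
# installed, that LMTokenMapping() has usable defaults, and that the corpus
# is large enough for the capped sentencepiece vocabulary to be trainable.
from pathlib import Path

from dan.datasets.extract.utils import Tokenizer
from dan.utils import LMTokenMapping

outdir = Path("language_model")
outdir.mkdir(parents=True, exist_ok=True)

# Stand-in for self.data["train"].values()
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
    "how vexingly quick daft zebras jump",
] * 50

tokenizer = Tokenizer(
    corpus,
    outdir=outdir,
    mapping=LMTokenMapping(),
    tokens=None,              # no NER tokens in this toy run
    subword_vocab_size=1000,  # further capped to 3 * n_words by the property above
)

doc = corpus[0]
print(tokenizer.char_tokenize(doc))  # space-separated characters, spaces encoded
print(tokenizer.word_tokenize(doc))  # words separated by the encoded space symbol
# Deterministic now that sampling is disabled:
assert tokenizer.subword_tokenize(doc) == tokenizer.subword_tokenize(doc)

The constructor writes tmp.txt plus the subword_tokenizer.model and .vocab files under outdir and deletes the temporary corpus file afterwards, matching train_subword_tokenizer above.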