
Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
2 files changed: +18 −28
@@ -124,8 +124,12 @@ def get_bbox(polygon: List[List[int]]) -> str:
 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
     """
     def __init__(
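
For reference, a minimal usage sketch of the class as documented above. The full __init__ signature is cut off by this diff, so the keyword names below simply mirror the docstring, and the mapping and ner_tokens values are placeholders:

    from pathlib import Path

    # Hypothetical instantiation: argument names mirror the docstring; the real
    # signature is not fully visible in this diff.
    tokenizer = Tokenizer(
        training_corpus=["his majesty the king", "the king of spain"],
        outdir=Path("output/tokenizer"),  # where the sentencepiece model is written
        mapping=mapping,                  # displayed -> encoded special characters (e.g. space -> ⎵)
        tokens=ner_tokens,                # start/end tokens marking named entities
        subword_vocab_size=1000,
    )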
@@ -169,7 +173,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
@@ -178,27 +182,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )
-        # Delete the corpus file
+        # Delete the corpus file and load the model
         corpus_file.unlink()
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
         :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
@@ -206,16 +207,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words
 
-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
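
With these changes, all three tokenize methods return a single space-separated string rather than a list of tokens, the line format that n-gram language-modeling toolkits typically consume. A sketch of the expected behaviour, reusing the hypothetical tokenizer from the instantiation sketch above; the exact outputs depend on the trained sentencepiece model and on the mapping, so they are illustrative only:

    text = "the king"

    tokenizer.char_tokenize(text)
    # e.g. "t h e ⎵ k i n g": every character, with the space encoded as ⎵

    tokenizer.word_tokenize(text)
    # e.g. "the ⎵ king": words from wordpunct_tokenize, joined by the encoded space

    tokenizer.subword_tokenize(text)
    # e.g. "▁the ▁k ing": subword pieces from the sentencepiece model; output is
    # deterministic because sampling is disabled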