Automatic Text Recognition / DAN · Merge request !287

Support subword and word language models

Merged · Solene Tarride requested to merge subword-and-word-lm into main · 1 year ago
47 commits · 8 changed files

Closes #199 (closed)
Ref: https://redmine.teklia.com/issues/4941
Merge request reports · Viewing commit c9dedfa1: "Fix rebase errors" (Solene Tarride, 1 year ago)
8 files changed: +37 −246
dan/datasets/extract/utils.py (+123 −2)
 # -*- coding: utf-8 -*-
+import itertools
 import logging
+import operator
 import re
+from dataclasses import dataclass, field
 from io import BytesIO
-from typing import List
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Iterator, List, Optional, Union

 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -13,7 +20,7 @@ from tenacity import (
     wait_exponential,
 )

-from dan.utils import EntityType
+from dan.utils import EntityType, LMTokenMapping

 logger = logging.getLogger(__name__)
@@ -117,3 +124,117 @@ def get_bbox(polygon: List[List[int]]) -> str:
     x, y = min(all_x), min(all_y)
     width, height = max(all_x) - x, max(all_y) - y
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
+
+
+def get_vocabulary(tokenized_text: List[str]) -> set[str]:
+    """
+    Compute the set of vocabulary from the tokenized text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(set([token for doc in tokenized_text for token in doc.split()]))
+
+
+@dataclass
+class Tokenizer:
+    """
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param charset: List of valid characters; characters outside it are replaced by the unknown token.
+    :param unknown_token: Token used to replace out-of-charset characters.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
+    """
+
+    training_corpus: List[str]
+    charset: List[str]
+    unknown_token: str
+    outdir: Path
+    mapping: LMTokenMapping
+    tokens: Optional[EntityType] = None
+    subword_vocab_size: int = 1000
+    sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
+
+    @property
+    def prefix(self):
+        return self.outdir / "subword_tokenizer"
+
+    @property
+    def ner_tokens(self) -> Union[List[str], Iterator[str]]:
+        if self.tokens is None:
+            return []
+        return itertools.chain(
+            map(operator.attrgetter("start"), self.tokens.values()),
+            filter(
+                operator.truth,
+                map(operator.attrgetter("end"), self.tokens.values()),
+            ),
+        )
+
+    @property
+    def mapping_tokens(self) -> List[str]:
+        return [token.encoded for token in self.mapping]
+
+    @property
+    def special_tokens(self) -> List[str]:
+        return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
+
+    def __post_init__(self) -> None:
+        """
+        Train a sentencepiece model on the training corpus.
+        """
+        # Write the corpus in a text file
+        logger.info("Training a sentencepiece model for subword tokenization")
+        with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
+            tmp.write("\n".join(self.training_corpus))
+            tmp.flush()
+            spm.SentencePieceTrainer.train(
+                input=tmp.name,
+                vocab_size=self.subword_vocab_size,
+                model_prefix=self.prefix,
+                user_defined_symbols=self.special_tokens,
+            )
+
+        # Load the model
+        self.sentencepiece_model = spm.SentencePieceProcessor(
+            model_file=str(self.prefix.with_suffix(".model"))
+        )
+
+    def subword_tokenize(self, text: str) -> str:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
+        """
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
+        return " ".join(map("".join, map(self.encode, tokens)))
+
+    def word_tokenize(self, text: str) -> str:
+        """
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
+        """
+        words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
+        return " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
+
+    def char_tokenize(self, text: str) -> str:
+        """
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
+        """
+        return " ".join(
+            [
+                char if char in self.charset else self.unknown_token
+                for char in self.encode(text)
+            ]
+        )
+
+    def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens.
+        :param text: Text to be encoded.
+        """
+        return map(self.mapping.encode_token, text)
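
The diff above adds everything needed to build character-, subword- and word-level corpora for language modelling. As a quick illustration of how the new Tokenizer could be exercised on its own, here is a minimal sketch. It assumes the class is importable from dan/datasets/extract/utils.py as added in this MR; the corpus, charset, unknown token and vocabulary size are made-up toy values, and SimpleToken/SimpleMapping are simplified stand-ins for dan.utils.LMTokenMapping that only provide the members the Tokenizer actually touches (iteration, .space.encoded and .encode_token), not the real API.

# Hypothetical usage sketch: toy corpus and a simplified stand-in mapping.
from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory

from dan.datasets.extract.utils import Tokenizer  # class added by this MR


@dataclass
class SimpleToken:
    encoded: str  # character as written in the language-model corpus
    display: str  # character as displayed in the transcription


class SimpleMapping:
    # Stand-in for LMTokenMapping: only iteration, `.space.encoded` and
    # `.encode_token` are needed by the Tokenizer.
    def __init__(self):
        self.space = SimpleToken("⎵", " ")
        self._by_display = {" ": self.space}

    def __iter__(self):
        return iter(self._by_display.values())

    def encode_token(self, char: str) -> str:
        # Replace a displayed character by its encoded version, if any.
        return self._by_display[char].encoded if char in self._by_display else char


corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
] * 10
charset = sorted(set("".join(corpus)) - {" "}) + ["⎵"]

with TemporaryDirectory() as tmpdir:
    tokenizer = Tokenizer(
        training_corpus=corpus,
        charset=charset,
        unknown_token="⁇",
        outdir=Path(tmpdir),
        mapping=SimpleMapping(),
        subword_vocab_size=45,  # must stay small for such a tiny corpus
    )
    print(tokenizer.char_tokenize("the quick fox"))     # t h e ⎵ q u i c k ⎵ f o x
    print(tokenizer.word_tokenize("the quick fox"))     # the ⎵ quick ⎵ fox
    print(tokenizer.subword_tokenize("the quick fox"))  # e.g. ▁the ▁quick ▁fox

Instantiating the dataclass is the expensive step, since __post_init__ immediately trains a sentencepiece model on the corpus; the three *_tokenize methods then return space-separated token strings that can be fed to character, subword and word language models.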