diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 5cd8f5333f3043cb97fe9113e7c96c0731e15b1e..9ed48b583bce63590f89930fd9f3cfc33c9b9baa 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -124,8 +124,12 @@ def get_bbox(polygon: List[List[int]]) -> str:
 
 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training texts.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
     """
 
     def __init__(
@@ -169,7 +173,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
 
-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
@@ -178,27 +182,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )
 
-        # Delete the corpus file
+        # Delete the corpus file and load the model
        corpus_file.unlink()
-
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
@@ -206,16 +207,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words
 
-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
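
For context, a minimal usage sketch of the reworked interface (not part of the diff): after this change, char_tokenize, word_tokenize, and subword_tokenize each return a single space-separated string instead of a List[str]. The keyword arguments mirror the docstring added above; the corpus, output directory, vocabulary size, and the mapping/tokens objects below are placeholder assumptions, not values taken from the repository.

from pathlib import Path

from dan.datasets.extract.utils import Tokenizer

# Placeholder inputs: `mapping` (displayed <-> encoded special characters) and
# `tokens` (start/end tokens for named entities) must come from the extraction
# configuration; they are assumed to exist here.
tokenizer = Tokenizer(
    training_corpus=["John Doe was here", "Jane Smith was not"],
    outdir=Path("subword_tokenizer"),
    mapping=mapping,
    tokens=tokens,
    subword_vocab_size=55,
)

text = "John Doe was here"
print(tokenizer.char_tokenize(text))     # space-separated characters, specials encoded
print(tokenizer.word_tokenize(text))     # words joined with the encoded space symbol
print(tokenizer.subword_tokenize(text))  # space-separated sentencepiece subwords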