diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 9e0706929dfc4de96f20296f775aeb263c6e9617..2befb801b680b89328c99eb714f0e3733802f223 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize
 
 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,   
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:
 
         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]
 
         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )
 
         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]
 
     def export(self):
         (self.output / "labels.json").write_text(
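For reference, a minimal sketch of the lexicon line format built above: each vocabulary entry is paired with its character-level tokenization, one entry per line. The vocabulary and the stand-in `char_tokenize` below are purely illustrative; the real values come from the trained `Tokenizer` and `LMTokenMapping`.

```python
# Illustrative stand-in for tokenizer.char_tokenize(); the real method also
# encodes special characters (e.g. spaces) through LMTokenMapping.
def char_tokenize(text: str) -> str:
    return " ".join(text)

# Hypothetical word vocabulary extracted from self.language_corpus["words"].
word_vocabulary = {"le", "chat"}

# One "word <space-separated characters>" line per entry, mirroring
# self.language_lexicon["words"] above.
lexicon_words = [f"{word} {char_tokenize(word)}" for word in sorted(word_vocabulary)]
# -> ["chat c h a t", "le l e"]
```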
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 7b92d773152f1f57336f89b487a04d9c91b6d815..a5f7c4683aaa2d98c1ab6b9c3f56563f2c7ce08b 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -2,9 +2,12 @@
 import logging
 import re
 from io import BytesIO
+from pathlib import Path
 from typing import List
 
 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -12,10 +15,8 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-from pathlib import Path
-from dan.utils import EntityType
-import sentencepiece as spm
-from nltk import wordpunct_tokenize
+
+from dan.utils import EntityType, LMTokenMapping
 
 logger = logging.getLogger(__name__)
 
@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
 
 
-class Tokenizer():
-    """ 
+class Tokenizer:
+    """
     A multi-level tokenizer (char, subword, word)
     Subword tokenizer is trained using sentencepiece.
     """
-    def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
+
+    def __init__(
+        self,
+        training_corpus: List[str],
+        outdir: Path,
+        mapping: LMTokenMapping,
+        tokens: EntityType = None,
+        subword_vocab_size: int = 1000,
+    ) -> None:
         self.corpus = training_corpus
         self.outdir = outdir
         self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.sentencepiece_model = None
-        self.mapping = mapping
         self.tokens = tokens
+        self.mapping = mapping
+        # Train the subword tokenizer
+        self.user_subword_vocab_size = subword_vocab_size
+        self.sentencepiece_model = self.train_subword_tokenizer()
 
     @property
-    def ner_tokens(self):
-        return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
+    def ner_tokens(self) -> List[str]:
+        if self.tokens is None:
+            return []
+        return [entity.start for entity in self.tokens.values()] + [
+            entity.end for entity in self.tokens.values() if entity.end != ""
+        ]
 
     @property
-    def mapping_tokens(self):
+    def mapping_tokens(self) -> List[str]:
         return [token.encoded for token in self.mapping]
-    
+
     @property
-    def special_tokens(self):        
+    def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))
 
+    @property
+    def subword_vocab_size(self):
+        n_words = len({word for doc in self.corpus for word in doc.split()})
+        return min(self.user_subword_vocab_size, 3 * n_words)
+
     def train_subword_tokenizer(self):
-        """ 
+        """
         Train a sentencepiece model on the training corpus.
         """
         # Write the corpus in a text file
-        corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
+        corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
 
         # Train the tokenizer and load it
         logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
+        spm.SentencePieceTrainer.train(
+            input=str(corpus_file),
+            vocab_size=self.subword_vocab_size,
+            model_prefix=self.prefix,
+            user_defined_symbols=self.special_tokens,
+        )
 
         # Delete the corpus file
         corpus_file.unlink()
 
-        # Load the corpus
-        self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        # Load the model and return it
+        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
-        """ 
-        Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
+    def subword_tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
-        tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
-        # Replace special sentencepiece space token 
-        tokens = [t.replace("▁", "⎵") for t in tokens]
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
     def word_tokenize(self, text: str) -> List[str]:
-        """ 
+        """
         Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as distinct words.
+        Spaces (⎵) and NER tokens are treated as distinct words.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
+        words = " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
         return words
-         
+
     def char_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into characters
@@ -188,5 +218,7 @@ class Tokenizer():
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens using the LM token mapping.
+        """
         return map(self.mapping.encode_token, text)
-
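For reviewers, a minimal usage sketch of the refactored `Tokenizer`. Assumptions: `LMTokenMapping()` can be built with its defaults, the output directory exists, and the toy corpus is large enough for sentencepiece to reach the requested vocabulary size; the real extractor passes its own mapping, NER tokens and output path.

```python
from pathlib import Path

from dan.datasets.extract.utils import Tokenizer
from dan.utils import LMTokenMapping

# Toy corpus; in the extractor this is the list of training transcriptions.
train_corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
    "how vexingly quick daft zebras jump",
    "sphinx of black quartz judge my vow",
]

outdir = Path("language_model")
outdir.mkdir(parents=True, exist_ok=True)

# Training now happens in __init__ (no separate train_subword_tokenizer() call),
# and tokens may be omitted since ner_tokens handles tokens=None.
tokenizer = Tokenizer(
    train_corpus,
    outdir=outdir,
    mapping=LMTokenMapping(),
    subword_vocab_size=60,
)

print(tokenizer.char_tokenize(train_corpus[0]))
print(tokenizer.word_tokenize(train_corpus[0]))
print(tokenizer.subword_tokenize(train_corpus[0]))
```

Note that `subword_tokenize` is now deterministic (sampling disabled), so the subword corpus and lexicon are reproducible across runs.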