diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 6b13c5d8ff1ece325f4c3d617a8ef856f5c9c979..88975536ce4a725b94e9ecafe3671d0d63d393f3 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -371,8 +371,11 @@ class ArkindexExtractor:
             text.replace(self.mapping.linebreak.display, self.mapping.space.display)
             for text in self.data["train"].values()
         ]
+
         tokenizer = Tokenizer(
             training_corpus=train_corpus,
+            charset=self.language_tokens,
+            unknown_token=self.unknown_token,
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index f0371b6aed7d4912de048f0358e0c372c0bfd7ee..60e597eceb191e5c9c5c53e58838f4427dca00b2 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -131,9 +131,7 @@ def get_vocabulary(tokenized_text: List[str]) -> set[str]:
     Compute set of vocabulary from tokenzied text.
     :param tokenized_text: List of tokenized text.
     """
-    return sorted(
-        set([token for doc in tokenized_text for token in doc.split() if token != ""])
-    )
+    return sorted(set([token for doc in tokenized_text for token in doc.split()]))
 
 
 @dataclass
@@ -148,6 +146,8 @@ class Tokenizer:
     """
 
     training_corpus: List[str]
+    charset: List[str]
+    unknown_token: str
     outdir: Path
     mapping: LMTokenMapping
     tokens: Optional[EntityType] = None
@@ -225,7 +225,11 @@ class Tokenizer:
         Tokenize text into a string of space-separated characters.
         :param text: Text to be tokenized.
         """
-        return " ".join(self.encode(list(text)))
+        return " ".join(
+            self.encode(
+                [char if char in self.charset else self.unknown_token for char in text]
+            )
+        )
 
     def encode(self, text: List[str]) -> List[str]:
         """