diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 62a4c620c53d998e6436bce04d0fb8a3c837ac21..e3ae07637aa3fe33a00b9a9717fe020e65b580ba 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -355,6 +355,8 @@ class ArkindexExtractor:
         Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
         """
         logger.info("Preparing language resources")
+        # Add unknown token to charset
+        self.charset.add(self.unknown_token)
 
         # Build LM tokens
         for token in sorted(list(self.charset)):