From e24f36c58a6f48fe9484793298f61efc7b7297d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Mon, 23 Oct 2023 08:55:23 +0200
Subject: [PATCH] Add unknown token to charset

---
 dan/datasets/extract/arkindex.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 62a4c620..e3ae0763 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -355,6 +355,8 @@ class ArkindexExtractor:
         Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
         """
         logger.info("Preparing language resources")
+        # Add unknown token to charset
+        self.charset.add(self.unknown_token)
 
         # Build LM tokens
         for token in sorted(list(self.charset)):
-- 
GitLab