From 53364dd6b7c1ed9df635203a3ec9204def654dbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Thu, 19 Oct 2023 16:22:01 +0200
Subject: [PATCH] Encode text before checking for unknown characters

---
 dan/datasets/extract/utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 60e597ec..8ee14af3 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -226,9 +226,10 @@ class Tokenizer:
         :param text: Text to be tokenized.
         """
         return " ".join(
-            self.encode(
-                [char if char in self.charset else self.unknown_token for char in text]
-            )
+            [
+                char if char in self.charset else self.unknown_token
+                for char in self.encode(text)
+            ]
         )
 
     def encode(self, text: List[str]) -> List[str]:
-- 
GitLab