From 53364dd6b7c1ed9df635203a3ec9204def654dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Thu, 19 Oct 2023 16:22:01 +0200 Subject: [PATCH] Encode text before checking for unknown characters --- dan/datasets/extract/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 60e597ec..8ee14af3 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -226,9 +226,10 @@ class Tokenizer: :param text: Text to be tokenized. """ return " ".join( - self.encode( - [char if char in self.charset else self.unknown_token for char in text] - ) + [ + char if char in self.charset else self.unknown_token + for char in self.encode(text) + ] ) def encode(self, text: List[str]) -> List[str]: -- GitLab