diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 60e597eceb191e5c9c5c53e58838f4427dca00b2..8ee14af3685aaeb1842fce56704159eb2bacfa74 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -226,9 +226,10 @@ class Tokenizer: :param text: Text to be tokenized. """ return " ".join( - self.encode( - [char if char in self.charset else self.unknown_token for char in text] - ) + [ + char if char in self.charset else self.unknown_token + for char in self.encode(text) + ] ) def encode(self, text: List[str]) -> List[str]: