diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index 62a4c620c53d998e6436bce04d0fb8a3c837ac21..e3ae07637aa3fe33a00b9a9717fe020e65b580ba 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -355,6 +355,8 @@ class ArkindexExtractor: Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset. """ logger.info("Preparing language resources") + # Add unknown token to charset + self.charset.add(self.unknown_token) # Build LM tokens for token in sorted(list(self.charset)):