diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 88975536ce4a725b94e9ecafe3671d0d63d393f3..62a4c620c53d998e6436bce04d0fb8a3c837ac21 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -365,6 +365,7 @@ class ArkindexExtractor:
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
         self.language_tokens.append(self.mapping.ctc.encoded)
+        self.language_tokens.append(self.unknown_token)
 
         # Build LM corpus
         train_corpus = [
diff --git a/tests/test_extract.py b/tests/test_extract.py
index ce4e13587ae8479f14778b3eb73484eb7197bd7d..0f5207862b3229b2f48376fbda5206015c905813 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -654,6 +654,7 @@ def test_extract(
         "▁" if t.isspace() else t for t in sorted(list(expected_charset))
     ]
     expected_language_tokens.append("◌")
+    expected_language_tokens.append("⁇")
     assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
         expected_language_tokens
     )