diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 88975536ce4a725b94e9ecafe3671d0d63d393f3..62a4c620c53d998e6436bce04d0fb8a3c837ac21 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -365,6 +365,7 @@ class ArkindexExtractor:
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
         self.language_tokens.append(self.mapping.ctc.encoded)
+        self.language_tokens.append(self.unknown_token)
 
         # Build LM corpus
         train_corpus = [
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 120cd78838c20c756769fc8bf782289612749967..ca7b0e56daf22b7ab164293fae48a3c8f7b7023a 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -654,6 +654,7 @@ def test_extract(
         "▁" if t.isspace() else t for t in sorted(list(expected_charset))
     ]
     expected_language_tokens.append("◌")
+    expected_language_tokens.append("⇍")
     assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
         expected_language_tokens
     )