From d15506a49a4e8aef1da8a5d9b2b60d4f90284911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Fri, 20 Oct 2023 17:47:16 +0200 Subject: [PATCH] Add the unknown character to the list of tokens --- dan/datasets/extract/extract.py | 1 + tests/test_extract.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 88975536..62a4c620 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -365,6 +365,7 @@ class ArkindexExtractor: self.mapping.encode[token] ) if token in self.mapping.encode else self.language_tokens.append(token) self.language_tokens.append(self.mapping.ctc.encoded) + self.language_tokens.append(self.unknown_token) # Build LM corpus train_corpus = [ diff --git a/tests/test_extract.py b/tests/test_extract.py index 120cd788..ca7b0e56 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -654,6 +654,7 @@ def test_extract( "▁" if t.isspace() else t for t in sorted(list(expected_charset)) ] expected_language_tokens.append("◌") + expected_language_tokens.append("⁇") assert (output / "language_model" / "tokens.txt").read_text() == "\n".join( expected_language_tokens ) -- GitLab