From c23b608ad5ab501b0708a36939f699b9fec3f3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Fri, 20 Oct 2023 17:47:16 +0200 Subject: [PATCH] Add the unknown character to the list of tokens --- dan/datasets/extract/arkindex.py | 1 + tests/test_extract.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index 88975536..62a4c620 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -365,6 +365,7 @@ class ArkindexExtractor: self.mapping.encode[token] ) if token in self.mapping.encode else self.language_tokens.append(token) self.language_tokens.append(self.mapping.ctc.encoded) + self.language_tokens.append(self.unknown_token) # Build LM corpus train_corpus = [ diff --git a/tests/test_extract.py b/tests/test_extract.py index ce4e1358..0f520786 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -654,6 +654,7 @@ def test_extract( "▁" if t.isspace() else t for t in sorted(list(expected_charset)) ] expected_language_tokens.append("◌") + expected_language_tokens.append("⁇") assert (output / "language_model" / "tokens.txt").read_text() == "\n".join( expected_language_tokens ) -- GitLab