From 619e1c5cd56603c45a785940d9ebbf517a32a8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Mon, 16 Oct 2023 17:55:34 +0200
Subject: [PATCH] Replace linebreaks with spaces for LM

---
 dan/datasets/extract/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 9bff74d9..6b4b006e 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -370,7 +370,7 @@ class ArkindexExtractor:
         ), "Tokens should be single characters."
 
         # Build LM corpus
-        train_corpus = [text for text in self.data["train"].values()]
+        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
-- 
GitLab