From 619e1c5cd56603c45a785940d9ebbf517a32a8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Mon, 16 Oct 2023 17:55:34 +0200
Subject: [PATCH] Replace linebreaks with spaces in the language model (LM) training corpus

---
 dan/datasets/extract/extract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 9bff74d9..6b4b006e 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -370,7 +370,7 @@ class ArkindexExtractor:
         ), "Tokens should be single characters."
 
         # Build LM corpus
-        train_corpus = [text for text in self.data["train"].values()]
+        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
-- 
GitLab