diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 9bff74d9cf0f3b5f5a935d542231dc2ebd84e710..6b4b006eb3d5b917233049da75eb1a026e04a663 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -370,7 +370,7 @@ class ArkindexExtractor:
         ), "Tokens should be single characters."
 
         # Build LM corpus
-        train_corpus = [text for text in self.data["train"].values()]
+        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",