diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 9bff74d9cf0f3b5f5a935d542231dc2ebd84e710..6b4b006eb3d5b917233049da75eb1a026e04a663 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -370,7 +370,7 @@ class ArkindexExtractor: ), "Tokens should be single characters." # Build LM corpus - train_corpus = [text for text in self.data["train"].values()] + train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()] tokenizer = Tokenizer( train_corpus, outdir=self.output / "language_model",