diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 6b4b006eb3d5b917233049da75eb1a026e04a663..7478f0190751817fefec72943eb9a00ebd9c4007 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
     Tokenizer,
     download_image,
     get_bbox,
+    get_vocabulary,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
             self.language_tokens.append(
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
-
         self.language_tokens.append(self.mapping.ctc.encoded)
-        assert all(
-            [len(token) == 1 for token in self.language_lexicon]
-        ), "Tokens should be single characters."
 
         # Build LM corpus
-        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
+        train_corpus = [
+            text.replace(self.mapping.linebreak.display, self.mapping.space.display)
+            for text in self.data["train"].values()
+        ]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
@@ -388,36 +388,18 @@ class ArkindexExtractor:
             tokenizer.subword_tokenize(doc) for doc in train_corpus
         ]
 
-        # Build vocabulary
-        word_vocabulary = set(
-            [
-                word
-                for doc in self.language_corpus["words"]
-                for word in doc.split()
-                if word != ""
-            ]
-        )
-        subword_vocabulary = set(
-            [
-                subword
-                for doc in self.language_corpus["subwords"]
-                for subword in doc.split()
-                if subword != ""
-            ]
-        )
-
         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
         self.language_lexicon["words"] = [
             f"{word} {tokenizer.char_tokenize(word)}"
-            for word in sorted(word_vocabulary)
+            for word in get_vocabulary(self.language_corpus["words"])
             if word != ""
         ]
         self.language_lexicon["subwords"] = [
             f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in sorted(subword_vocabulary)
+            for subword in get_vocabulary(self.language_corpus["subwords"])
         ]
 
     def export(self):
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 9ed48b583bce63590f89930fd9f3cfc33c9b9baa..092257ee10f56ad38920f6bea1d39b0b2c080117 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
 
 
+def get_vocabulary(tokenized_text: List[str]) -> List[str]:
+    """
+    Compute the sorted vocabulary from tokenized text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(
+        set([token for doc in tokenized_text for token in doc.split() if token != ""])
+    )
+
+
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
     :param training_corpus: List of training text.
     :param outdir: Path to save the subword tokenizer.
-    :param mapping: Mapping between displayed and encoded versions of special characters
+    :param mapping: Mapping between displayed and encoded versions of special characters.
     :param tokens: Start and end tokens used to represent named entities.
     :param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
     """
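
For illustration only, not part of the patch: a minimal, standalone sketch of the behaviour the new get_vocabulary helper is expected to have, namely building a sorted, deduplicated vocabulary from whitespace-tokenized documents while dropping empty tokens. The function body is copied locally so the snippet runs without installing dan; the sample corpus values are made up.

from typing import List


def get_vocabulary(tokenized_text: List[str]) -> List[str]:
    """Return the sorted set of unique, non-empty tokens found in the tokenized corpus."""
    return sorted(
        set(token for doc in tokenized_text for token in doc.split() if token != "")
    )


if __name__ == "__main__":
    # Whitespace-tokenized documents, e.g. the entries of self.language_corpus["words"].
    corpus = ["the cat sat", "the dog ran", ""]
    print(get_vocabulary(corpus))  # ['cat', 'dog', 'ran', 'sat', 'the']

Returning a sorted list rather than a raw set keeps the generated lexicon files deterministic across runs.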