diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 2b15cea2ef2b4aa85fd44a1768504a2eb38007f9..ba40bf90e7dd19b06529fe539c455b1f99fa3d69 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -82,8 +82,24 @@ class ArkindexExtractor:
         self.keep_spaces = keep_spaces
         self.subword_vocab_size = subword_vocab_size
 
+        # Load files from a previous extraction
+        data_path = self.output / "split.json"
+        charset_path = self.output / "charset.pkl"
+
+        is_data_file = data_path.exists()
+        is_charset_file = charset_path.exists()
+
         self.data: Dict = defaultdict(dict)
         self.charset = set()
+
+        if is_data_file and is_charset_file:
+            self.data.update(json.loads(data_path.read_bytes()))
+            self.charset.update(sorted(pickle.loads(charset_path.read_bytes())))
+        elif is_data_file ^ is_charset_file:
+            raise FileNotFoundError(
+                f"The file '{data_path.name}' or '{charset_path.name}' is missing at location {self.output.as_posix()}"
+            )
+
         self.language_corpus = defaultdict(list)
         self.language_tokens = []
         self.language_lexicon = defaultdict(list)
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 308a34c3a54c469ffe2a8161da509f1be5ae9407..368d99984dace2f1de58708b8fc06c340e6151c6 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -229,6 +229,7 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
         ),
     ),
 )
+@pytest.mark.parametrize("existing", (True, False))
 def test_extract(
     load_entities,
     keep_spaces,
@@ -238,6 +239,7 @@ def test_extract(
     expected_subword_language_corpus,
     subword_vocab_size,
     tmp_path,
+    existing,
 ):
     output = tmp_path / "extraction"
     output.mkdir(parents=True, exist_ok=True)
@@ -250,6 +252,48 @@ def test_extract(
         if token
     ]
 
+    # Add a character to fake a previous extraction file in the folder
+    previous_character = "%"
+
+    if existing:
+        charset_path = output / "charset.pkl"
+        data_path = output / "split.json"
+
+        dataset_type = "train"
+
+        data_id = "train-page_1-line_5"
+
+        data = {
+            "dataset-id": "dataset-id",
+            "image": {
+                "iiif_url": f"{FIXTURES}/extraction/images/text_line/test-page_1-line_1.jpg",
+                "polygon": [
+                    [37, 191],
+                    [37, 339],
+                    [767, 339],
+                    [767, 191],
+                    [37, 191],
+                ],
+            },
+            "text": previous_character,
+        }
+
+        charset_path.write_bytes(pickle.dumps([previous_character]))
+        data_path.write_text(
+            json.dumps(
+                {dataset_type: {data_id: data}},
+            )
+        )
+
+        split_content[dataset_type][data_id] = data
+
+        keys = list(split_content["train"].keys())
+        keys.sort()
+        split_content["train"] = {i: split_content["train"][i] for i in keys}
+
+        # Add 1 to subword_vocab_size because we have one more subword, which is {previous_character}
+        subword_vocab_size += 1
+
     extractor = ArkindexExtractor(
         dataset_ids=["dataset_id"],
         element_type=["text_line"],
@@ -264,6 +308,7 @@ def test_extract(
         keep_spaces=keep_spaces,
         subword_vocab_size=subword_vocab_size,
     )
+
     extractor.run()
 
     expected_paths = [
@@ -337,6 +382,17 @@ def test_extract(
 ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
 ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
 
+    if existing:
+        expected_char_language_corpus = (
+            f"{previous_character}\n" + expected_char_language_corpus
+        )
+        expected_word_language_corpus = (
+            f"{previous_character}\n" + expected_word_language_corpus
+        )
+        expected_subword_language_corpus = (
+            f"▁{previous_character}\n" + expected_subword_language_corpus
+        )
+
     # Transcriptions with worker version are in lowercase
     if transcription_entities_worker_version:
         expected_char_language_corpus = expected_char_language_corpus.lower()