diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 8e7be4c44e12c162b3aa883731a9fb198d32415c..e2aa30882248982b4bc8f2fba6fc998d8a7ba170 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -282,6 +282,10 @@ class ArkindexExtractor:
                 )
                 continue
 
+            # Extract the train split first so that `self.charset` is built correctly
+            splits.remove(TRAIN_NAME)
+            splits.insert(0, TRAIN_NAME)
+
             # Iterate over the subsets to find the page images and labels.
             for split in splits:
                 with tqdm(
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ad5b89cc0facf4f7320e47c152cbfe4bb719bb8..f6f4b36a2cb1fbeacca8550db64958f2a74540bc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,7 +21,7 @@ from arkindex_export import (
     WorkerVersion,
     database,
 )
-from dan.datasets.extract.arkindex import SPLIT_NAMES
+from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
 from tests import FIXTURES
 
 
@@ -181,15 +181,17 @@ def mock_database(tmp_path_factory):
     )
 
     # Create dataset
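+    # The train split is last here; ArkindexExtractor reorders the splits to process it first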
+    split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
     dataset = Dataset.create(
         id="dataset_id",
         name="Dataset",
         state="complete",
-        sets=",".join(SPLIT_NAMES),
+        sets=",".join(split_names),
     )
 
     # Create dataset elements
-    for split in SPLIT_NAMES:
+    for split in split_names:
         element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
             ".json"
         )