From 928f327ebf99ee2c1b3ef7c015717953748633f3 Mon Sep 17 00:00:00 2001
From: manonBlanco <blanco@teklia.com>
Date: Tue, 9 Jan 2024 17:01:53 +0100
Subject: [PATCH] Extract the "train" set first

---
 dan/datasets/extract/arkindex.py | 4 ++++
 tests/conftest.py                | 7 ++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 8e7be4c4..e2aa3088 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -282,6 +282,10 @@ class ArkindexExtractor:
                 )
                 continue
 
+            # Extract the train set first to correctly build the `self.charset` variable
+            splits.remove(TRAIN_NAME)
+            splits.insert(0, TRAIN_NAME)
+
             # Iterate over the subsets to find the page images and labels.
             for split in splits:
                 with tqdm(
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ad5b89c..f6f4b36a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,7 +21,7 @@ from arkindex_export import (
     WorkerVersion,
     database,
 )
-from dan.datasets.extract.arkindex import SPLIT_NAMES
+from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
 from tests import FIXTURES
 
 
@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory):
     )
 
     # Create dataset
+    split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
     dataset = Dataset.create(
         id="dataset_id",
         name="Dataset",
         state="complete",
-        sets=",".join(SPLIT_NAMES),
+        sets=",".join(split_names),
     )
 
     # Create dataset elements
-    for split in SPLIT_NAMES:
+    for split in split_names:
         element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
             ".json"
         )
-- 
GitLab