From 928f327ebf99ee2c1b3ef7c015717953748633f3 Mon Sep 17 00:00:00 2001 From: manonBlanco <blanco@teklia.com> Date: Tue, 9 Jan 2024 17:01:53 +0100 Subject: [PATCH] Extract the "train" set first --- dan/datasets/extract/arkindex.py | 4 ++++ tests/conftest.py | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index 8e7be4c4..e2aa3088 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -282,6 +282,10 @@ class ArkindexExtractor: ) continue + # Extract the train set first to correctly build the `self.charset` variable + splits.remove(TRAIN_NAME) + splits.insert(0, TRAIN_NAME) + # Iterate over the subsets to find the page images and labels. for split in splits: with tqdm( diff --git a/tests/conftest.py b/tests/conftest.py index 4ad5b89c..f6f4b36a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ from arkindex_export import ( WorkerVersion, database, ) -from dan.datasets.extract.arkindex import SPLIT_NAMES +from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME from tests import FIXTURES @@ -181,15 +181,16 @@ def mock_database(tmp_path_factory): ) # Create dataset + split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME] dataset = Dataset.create( id="dataset_id", name="Dataset", state="complete", - sets=",".join(SPLIT_NAMES), + sets=",".join(split_names), ) # Create dataset elements - for split in SPLIT_NAMES: + for split in split_names: element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix( ".json" ) -- GitLab