diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 8e7be4c44e12c162b3aa883731a9fb198d32415c..e2aa30882248982b4bc8f2fba6fc998d8a7ba170 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -282,6 +282,10 @@ class ArkindexExtractor:
                 )
                 continue
 
+        # Extract the train set first to correctly build the `self.charset` variable
+        splits.remove(TRAIN_NAME)
+        splits.insert(0, TRAIN_NAME)
+
         # Iterate over the subsets to find the page images and labels.
         for split in splits:
             with tqdm(
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ad5b89cc0facf4f7320e47c152cbfe4bb719bb8..f6f4b36a2cb1fbeacca8550db64958f2a74540bc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,7 +21,7 @@ from arkindex_export import (
     WorkerVersion,
     database,
 )
-from dan.datasets.extract.arkindex import SPLIT_NAMES
+from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
 
 from tests import FIXTURES
 
@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory):
     )
 
     # Create dataset
+    split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
     dataset = Dataset.create(
         id="dataset_id",
         name="Dataset",
         state="complete",
-        sets=",".join(SPLIT_NAMES),
+        sets=",".join(split_names),
     )
 
     # Create dataset elements
-    for split in SPLIT_NAMES:
+    for split in split_names:
         element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
             ".json"
         )