Skip to content
Snippets Groups Projects
Commit 928f327e authored by Manon Blanco
Browse files

Extract the "train" set first

parent fab96823
No related branches found
No related tags found
1 merge request: !343 Extract the "train" set first
......@@ -282,6 +282,10 @@ class ArkindexExtractor:
)
continue
# Extract the train set first to correctly build the `self.charset` variable
splits.remove(TRAIN_NAME)
splits.insert(0, TRAIN_NAME)
# Iterate over the subsets to find the page images and labels.
for split in splits:
with tqdm(
......
......@@ -21,7 +21,7 @@ from arkindex_export import (
WorkerVersion,
database,
)
from dan.datasets.extract.arkindex import SPLIT_NAMES
from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
from tests import FIXTURES
......@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory):
)
# Create dataset
split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
dataset = Dataset.create(
id="dataset_id",
name="Dataset",
state="complete",
sets=",".join(SPLIT_NAMES),
sets=",".join(split_names),
)
# Create dataset elements
for split in SPLIT_NAMES:
for split in split_names:
element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
".json"
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment