Skip to content
Snippets Groups Projects
Commit 98f6b78c authored by Yoann Schneider :tennis:
Browse files

Merge branch 'extract-train-first' into 'main'

Extract the "train" set first

Closes #248

See merge request !343
parents fab96823 928f327e
No related branches found
No related tags found
1 merge request: !343 Extract the "train" set first
......@@ -282,6 +282,10 @@ class ArkindexExtractor:
)
continue
+# Extract the train set first to correctly build the `self.charset` variable
+splits.remove(TRAIN_NAME)
+splits.insert(0, TRAIN_NAME)
# Iterate over the subsets to find the page images and labels.
for split in splits:
with tqdm(
......
......@@ -21,7 +21,7 @@ from arkindex_export import (
WorkerVersion,
database,
)
-from dan.datasets.extract.arkindex import SPLIT_NAMES
+from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
from tests import FIXTURES
......@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory):
)
# Create dataset
+split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
dataset = Dataset.create(
id="dataset_id",
name="Dataset",
state="complete",
-sets=",".join(SPLIT_NAMES),
+sets=",".join(split_names),
)
# Create dataset elements
-for split in SPLIT_NAMES:
+for split in split_names:
element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
".json"
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment