Skip to content
Snippets Groups Projects
Commit 98f6b78c authored by Yoann Schneider :tennis:
Browse files

Merge branch 'extract-train-first' into 'main'

Extract the "train" set first

Closes #248

See merge request !343
parents fab96823 928f327e
No related branches found
No related tags found
1 merge request: !343 Extract the "train" set first
...@@ -282,6 +282,10 @@ class ArkindexExtractor: ...@@ -282,6 +282,10 @@ class ArkindexExtractor:
) )
continue continue
# Extract the train set first to correctly build the `self.charset` variable
splits.remove(TRAIN_NAME)
splits.insert(0, TRAIN_NAME)
# Iterate over the subsets to find the page images and labels. # Iterate over the subsets to find the page images and labels.
for split in splits: for split in splits:
with tqdm( with tqdm(
......
...@@ -21,7 +21,7 @@ from arkindex_export import ( ...@@ -21,7 +21,7 @@ from arkindex_export import (
WorkerVersion, WorkerVersion,
database, database,
) )
from dan.datasets.extract.arkindex import SPLIT_NAMES from dan.datasets.extract.arkindex import TEST_NAME, TRAIN_NAME, VAL_NAME
from tests import FIXTURES from tests import FIXTURES
...@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory): ...@@ -181,15 +181,16 @@ def mock_database(tmp_path_factory):
) )
# Create dataset # Create dataset
split_names = [VAL_NAME, TEST_NAME, TRAIN_NAME]
dataset = Dataset.create( dataset = Dataset.create(
id="dataset_id", id="dataset_id",
name="Dataset", name="Dataset",
state="complete", state="complete",
sets=",".join(SPLIT_NAMES), sets=",".join(split_names),
) )
# Create dataset elements # Create dataset elements
for split in SPLIT_NAMES: for split in split_names:
element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix( element_path = (FIXTURES / "extraction" / "elements" / split).with_suffix(
".json" ".json"
) )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment