
New DatasetExtractor using a DatasetWorker

Merged Eva Bardou requested to merge dataset-worker into main
2 files changed: +16 −13
@@ -11,13 +11,11 @@ from arkindex_worker.cache import (
     CachedTranscription,
     CachedTranscriptionEntity,
 )
+from worker_generic_training_dataset.db import retrieve_element
 from worker_generic_training_dataset.worker import DatasetExtractor
 
 
 def test_process_split(tmp_path, downloaded_images):
-    # Parent is train folder
-    parent_id: UUID = UUID("a0c4522d-2d80-4766-a01c-b9d686f41f6a")
-
     worker = DatasetExtractor()
     # Parse some arguments
     worker.args = Namespace(database=None)
@@ -27,18 +25,22 @@ def test_process_split(tmp_path, downloaded_images):
     # Where to save the downloaded images
     worker.image_folder = tmp_path
 
-    worker.process_split("train", parent_id)
-
-    # Should have created 20 elements in total
-    assert CachedElement.select().count() == 20
-
-    first_page_id = UUID("e26e6803-18da-4768-be30-a0a68132107c")
-    second_page_id = UUID("c673bd94-96b1-4a2e-8662-a4d806940b5f")
-
-    # Should have created two pages under root folder
-    assert (
-        CachedElement.select().where(CachedElement.parent_id == parent_id).count() == 2
-    )
+    first_page_id = UUID("e26e6803-18da-4768-be30-a0a68132107c")
+    second_page_id = UUID("c673bd94-96b1-4a2e-8662-a4d806940b5f")
+
+    worker.process_split(
+        "train",
+        [
+            retrieve_element(first_page_id),
+            retrieve_element(second_page_id),
+        ],
+    )
+
+    # Should have created 19 elements in total
+    assert CachedElement.select().count() == 19
+
+    # Should have created two pages at root
+    assert CachedElement.select().where(CachedElement.parent_id.is_null()).count() == 2
 
     # Should have created 8 text_lines under first page
     assert (
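For context, the updated test exercises a process_split that now receives the split name together with a list of elements fetched with retrieve_element, instead of the previous split name plus parent element id. The sketch below only illustrates that calling convention; it is not the worker's actual implementation, and the Element stand-in class, the class name and the print-based body are hypothetical.

# Minimal sketch of the new calling convention exercised by the test above.
# Everything here (the Element stand-in, the class name, the body) is
# hypothetical; only the signature shape comes from the merge request diff.
from typing import NamedTuple


class Element(NamedTuple):
    # Stand-in for whatever retrieve_element() returns from the local database.
    id: str
    type: str


class DatasetExtractorSketch:
    def process_split(self, split_name: str, elements: list[Element]) -> None:
        # Old signature: process_split("train", parent_id) walked the children
        # of a parent folder. New signature: the caller passes the elements of
        # the split directly.
        for element in elements:
            print(f"[{split_name}] caching element {element.id} ({element.type})")


if __name__ == "__main__":
    worker = DatasetExtractorSketch()
    worker.process_split(
        "train",
        [
            Element(id="e26e6803-18da-4768-be30-a0a68132107c", type="page"),
            Element(id="c673bd94-96b1-4a2e-8662-a4d806940b5f", type="page"),
        ],
    )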