Skip to content
Snippets Groups Projects
Verified Commit 739f5164 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

add tests and publish docker image with tag POC

parent 02222044
No related branches found
No related tags found
1 merge request!2Implement worker
Pipeline #81804 failed
......@@ -30,3 +30,5 @@ tox
```
To recreate tox virtual environment (e.g. a dependencies update), you may run `tox -r`
Tests use an export from [`IAM | GT | DLA`](https://preprod.arkindex.teklia.com/browse/53919f29-f79c-4c3e-b088-0e955950879f?top_level=true&folder=true).
......@@ -5,6 +5,7 @@
# Will automatically login to a registry if CI_REGISTRY, CI_REGISTRY_USER and CI_REGISTRY_PASSWORD are set.
# Will only push an image if $CI_REGISTRY is set.
VERSION="POC"
if [ -z "$VERSION" ]; then
VERSION=${CI_COMMIT_TAG:-latest}
fi
......
# -*- coding: utf-8 -*-
import os
from pathlib import Path
import pytest
from arkindex.mock import MockApiClient
from arkindex_export import open_database
from arkindex_worker.worker.base import BaseWorker
DATA_PATH = Path(__file__).parent / "data"
@pytest.fixture(autouse=True)
def setup_environment(responses, monkeypatch):
......@@ -26,3 +30,8 @@ def setup_environment(responses, monkeypatch):
# Setup a mock api client instead of using a real one
monkeypatch.setattr(BaseWorker, "setup_api_client", lambda _: MockApiClient())
@pytest.fixture(scope="session", autouse=True)
def arkindex_db():
open_database(DATA_PATH / "arkindex_export.sqlite")
# -*- coding: utf-8 -*-
import tempfile
from argparse import Namespace
from pathlib import Path
def test_dummy():
assert True
from arkindex_worker.cache import (
CachedClassification,
CachedElement,
CachedEntity,
CachedImage,
CachedTranscription,
CachedTranscriptionEntity,
)
from worker_generic_training_dataset.worker import DatasetExtractor
def test_process_split():
# Parent is train folder
parent_id = "a0c4522d-2d80-4766-a01c-b9d686f41f6a"
worker = DatasetExtractor()
# Mock important configuration steps
worker.args = Namespace(dev=False)
worker.initialize_database()
worker.cached_images = dict()
# Where to save the downloaded images
image_folder = Path(tempfile.mkdtemp())
worker.process_split("train", parent_id, image_folder)
# Should have created two pages under root folder
assert (
CachedElement.select().where(CachedElement.parent_id == parent_id).count() == 2
)
# Should have created 8 text_lines under first page
assert (
CachedElement.select()
.where(CachedElement.parent_id == "e26e6803-18da-4768-be30-a0a68132107c")
.count()
== 8
)
# Should have created one classification linked to first page
assert (
CachedClassification.select()
.where(CachedClassification.element == "e26e6803-18da-4768-be30-a0a68132107c")
.count()
== 1
)
# Should have created two images
assert CachedImage.select().count() == 2
assert sorted(image_folder.rglob("*")) == [
image_folder / "80a84b30-1ae1-4c13-95d6-7d0d8ee16c51.jpg",
image_folder / "e3c755f2-0e1c-468e-ae4c-9206f0fd267a.jpg",
]
# Should have created a transcription linked to first line of first page
assert (
CachedTranscription.select()
.where(CachedTranscription.element == "6411b293-dee0-4002-ac82-ffc5cdcb499d")
.count()
== 1
)
# Should have created a transcription entity linked to transcription of first line of first page
assert CachedEntity.select().count() == 1
assert CachedTranscriptionEntity.select().count() == 1
assert CachedEntity.get_by_id("e04b0323-0dda-4f76-b218-af40f7d40c84")
......@@ -25,8 +25,7 @@ def retrieve_element(element_id: str):
def list_classifications(element_id: str):
query = Classification.select().where(Classification.element_id == element_id)
return query
return Classification.select().where(Classification.element_id == element_id)
def parse_transcription(transcription: NamedTuple, element: CachedElement):
......
......@@ -76,11 +76,11 @@ class DatasetExtractor(BaseWorker):
"""
logger.info("Retrieving information from process_information")
train_folder_id = self.config.get("train_folder_id")
train_folder_id = self.process_information.get("train_folder_id")
assert train_folder_id, "A training folder id is necessary to use this worker"
self.training_folder_id = UUID(train_folder_id)
val_folder_id = self.config.get("validation_folder_id")
val_folder_id = self.process_information.get("validation_folder_id")
assert val_folder_id, "A validation folder id is necessary to use this worker"
self.validation_folder_id = UUID(val_folder_id)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment