add tests and publish docker image with tag POC

739f5164 · Yoann Schneider · 02222044 · 739f5164 · 739f5164 · 739f5164
Verified Commit 739f5164 authored 1 year ago by Yoann Schneider
--- a/README.md
+++ b/README.md
@@ -30,3 +30,5 @@ tox
 ```

 To recreate tox virtual environment (e.g. a dependencies update), you may run `tox -r`
+
+Tests use an export from [`IAM | GT | DLA`](https://preprod.arkindex.teklia.com/browse/53919f29-f79c-4c3e-b088-0e955950879f?top_level=true&folder=true).
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -5,6 +5,7 @@
 # Will automatically login to a registry if CI_REGISTRY, CI_REGISTRY_USER and CI_REGISTRY_PASSWORD are set.
 # Will only push an image if $CI_REGISTRY is set.

+VERSION="POC"
 if [ -z "$VERSION" ]; then
 	VERSION=${CI_COMMIT_TAG:-latest}
 fi

--- a/tests/conftest.py
+++ b/tests/conftest.py
 # -*- coding: utf-8 -*-
 import os
+from pathlib import Path

 import pytest

 from arkindex.mock import MockApiClient
+from arkindex_export import open_database
 from arkindex_worker.worker.base import BaseWorker

+DATA_PATH = Path(__file__).parent / "data"
+

 @pytest.fixture(autouse=True)
 def setup_environment(responses, monkeypatch):
@@ -26,3 +30,8 @@ def setup_environment(responses, monkeypatch):

    # Setup a mock api client instead of using a real one
    monkeypatch.setattr(BaseWorker, "setup_api_client", lambda _: MockApiClient())
+
+
+@pytest.fixture(scope="session", autouse=True)
+def arkindex_db():
+    open_database(DATA_PATH / "arkindex_export.sqlite")
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
 # -*- coding: utf-8 -*-

+import tempfile
+from argparse import Namespace
+from pathlib import Path

-def test_dummy():
-    assert True
+from arkindex_worker.cache import (
+    CachedClassification,
+    CachedElement,
+    CachedEntity,
+    CachedImage,
+    CachedTranscription,
+    CachedTranscriptionEntity,
+)
+from worker_generic_training_dataset.worker import DatasetExtractor
+
+
+def test_process_split():
+    # Parent is train folder
+    parent_id = "a0c4522d-2d80-4766-a01c-b9d686f41f6a"
+
+    worker = DatasetExtractor()
+    # Mock important configuration steps
+    worker.args = Namespace(dev=False)
+    worker.initialize_database()
+    worker.cached_images = dict()
+
+    # Where to save the downloaded images
+    image_folder = Path(tempfile.mkdtemp())
+
+    worker.process_split("train", parent_id, image_folder)
+
+    # Should have created two pages under root folder
+    assert (
+        CachedElement.select().where(CachedElement.parent_id == parent_id).count() == 2
+    )
+
+    # Should have created 8 text_lines under first page
+    assert (
+        CachedElement.select()
+        .where(CachedElement.parent_id == "e26e6803-18da-4768-be30-a0a68132107c")
+        .count()
+        == 8
+    )
+
+    # Should have created one classification linked to first page
+    assert (
+        CachedClassification.select()
+        .where(CachedClassification.element == "e26e6803-18da-4768-be30-a0a68132107c")
+        .count()
+        == 1
+    )
+
+    # Should have created two images
+    assert CachedImage.select().count() == 2
+    assert sorted(image_folder.rglob("*")) == [
+        image_folder / "80a84b30-1ae1-4c13-95d6-7d0d8ee16c51.jpg",
+        image_folder / "e3c755f2-0e1c-468e-ae4c-9206f0fd267a.jpg",
+    ]
+
+    # Should have created a transcription linked to first line of first page
+    assert (
+        CachedTranscription.select()
+        .where(CachedTranscription.element == "6411b293-dee0-4002-ac82-ffc5cdcb499d")
+        .count()
+        == 1
+    )
+
+    # Should have created a transcription entity linked to transcription of first line of first page
+    assert CachedEntity.select().count() == 1
+    assert CachedTranscriptionEntity.select().count() == 1
+
+    assert CachedEntity.get_by_id("e04b0323-0dda-4f76-b218-af40f7d40c84")
--- a/worker_generic_training_dataset/db.py
+++ b/worker_generic_training_dataset/db.py
@@ -25,8 +25,7 @@ def retrieve_element(element_id: str):


 def list_classifications(element_id: str):
-    query = Classification.select().where(Classification.element_id == element_id)
-    return query
+    return Classification.select().where(Classification.element_id == element_id)


 def parse_transcription(transcription: NamedTuple, element: CachedElement):

--- a/worker_generic_training_dataset/worker.py
+++ b/worker_generic_training_dataset/worker.py
@@ -76,11 +76,11 @@ class DatasetExtractor(BaseWorker):
        """
        logger.info("Retrieving information from process_information")

-        train_folder_id = self.config.get("train_folder_id")
+        train_folder_id = self.process_information.get("train_folder_id")
        assert train_folder_id, "A training folder id is necessary to use this worker"
        self.training_folder_id = UUID(train_folder_id)

-        val_folder_id = self.config.get("validation_folder_id")
+        val_folder_id = self.process_information.get("validation_folder_id")
        assert val_folder_id, "A validation folder id is necessary to use this worker"
        self.validation_folder_id = UUID(val_folder_id)