diff --git a/VERSION b/VERSION index 42c72d3b94d94ec0e29f0ed7cbc3caec38a25fb9..b54e0415998e65a659fc260984dc60e63e081a47 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.1-beta2 +0.2.2-beta2 diff --git a/arkindex_worker/cache.py b/arkindex_worker/cache.py index a6751c2d09d085118bfd72c4e237b07e8a365bc3..57823b7c75ea8907b832bbdfc1588bede9c88229 100644 --- a/arkindex_worker/cache.py +++ b/arkindex_worker/cache.py @@ -156,6 +156,7 @@ class CachedTranscriptionEntity(Model): entity = ForeignKeyField(CachedEntity, backref="transcription_entities") offset = IntegerField(constraints=[Check("offset >= 0")]) length = IntegerField(constraints=[Check("length > 0")]) + worker_version_id = UUIDField() class Meta: primary_key = CompositeKey("transcription", "entity") diff --git a/arkindex_worker/worker/base.py b/arkindex_worker/worker/base.py index 25e126ce50ab89def4e02162cb778a7783e1a81e..f9f9928ec43d9455b36f4ce42818e2ff9ab042f7 100644 --- a/arkindex_worker/worker/base.py +++ b/arkindex_worker/worker/base.py @@ -62,6 +62,7 @@ class BaseWorker(object): logger.info(f"Worker will use {self.work_dir} as working directory") self.process_information = None + self.user_configuration = None self.support_cache = support_cache # use_cache will be updated in configure() if the cache is supported and if there # is at least one available sqlite database either given or in the parent tasks @@ -160,6 +161,15 @@ class BaseWorker(object): # Load all required secrets self.secrets = {name: self.load_secret(name) for name in required_secrets} + # Load worker run configuration when available and not in dev mode + if os.environ.get("ARKINDEX_WORKER_RUN_ID") and not self.args.dev: + worker_run = self.request( + "RetrieveWorkerRun", id=os.environ["ARKINDEX_WORKER_RUN_ID"] + ) + self.user_configuration = worker_run.get("configuration") + if self.user_configuration: + logger.info("Loaded user configuration from WorkerRun") + task_id = os.environ.get("PONOS_TASK") paths = None if self.support_cache and self.args.database is not None: diff --git a/arkindex_worker/worker/element.py b/arkindex_worker/worker/element.py index b25e8c5160f04aedf99eca997761dba0d5654d0e..f751b19b632416e4b86c695df86db5c2a555e126 100644 --- a/arkindex_worker/worker/element.py +++ b/arkindex_worker/worker/element.py @@ -8,7 +8,34 @@ from arkindex_worker.cache import CachedElement, CachedImage from arkindex_worker.models import Element +class MissingTypeError(Exception): + """ + A required element type was not found in a corpus. + """ + + class ElementMixin(object): + def check_required_types(self, corpus_id: str, *type_slugs: str) -> bool: + """ + Check that a corpus has a list of required element types, + and raise an exception if any of them are missing. + """ + assert len(type_slugs), "At least one element type slug is required." + assert all( + isinstance(slug, str) for slug in type_slugs + ), "Element type slugs must be strings." + + corpus = self.request("RetrieveCorpus", id=corpus_id) + available_slugs = {element_type["slug"] for element_type in corpus["types"]} + missing_slugs = set(type_slugs) - available_slugs + + if missing_slugs: + raise MissingTypeError( + f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in the {corpus["name"]} corpus ({corpus["id"]}).' + ) + + return True + def create_sub_element(self, element, type, name, polygon): """ Create a child element on the given element through API diff --git a/arkindex_worker/worker/entity.py b/arkindex_worker/worker/entity.py index 9992b5491bf8a6a09c425ec37d297a1ed8ec4bd1..7dd5e85d294987701e258fee0869917cc7507128 100644 --- a/arkindex_worker/worker/entity.py +++ b/arkindex_worker/worker/entity.py @@ -21,7 +21,7 @@ class EntityType(Enum): class EntityMixin(object): def create_entity( - self, element, name, type, corpus=None, metas=None, validated=None + self, element, name, type, corpus=None, metas=dict(), validated=None ): """ Create an entity on the given corpus through API @@ -111,6 +111,7 @@ class EntityMixin(object): "entity": entity, "length": length, "offset": offset, + "worker_version_id": self.worker_version_id, }, ) # TODO: Report transcription entity creation @@ -118,15 +119,13 @@ class EntityMixin(object): if self.use_cache: # Store transcription entity in local cache try: - to_insert = [ - { - "transcription": transcription, - "entity": entity, - "offset": offset, - "length": length, - } - ] - CachedTranscriptionEntity.insert_many(to_insert).execute() + CachedTranscriptionEntity.create( + transcription=transcription, + entity=entity, + offset=offset, + length=length, + worker_version_id=self.worker_version_id, + ) except IntegrityError as e: logger.warning( f"Couldn't save created transcription entity in local cache: {e}" diff --git a/requirements.txt b/requirements.txt index 6811fbfd9393d72204d52fbba5648b71a0eb9941..102cb72ee5664d9111a6f721e4e25194137924cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ arkindex-client==1.0.6 peewee==3.14.4 -Pillow==8.2.0 +Pillow==8.3.1 python-gitlab==2.9.0 python-gnupg==0.4.7 sh==1.14.2 diff --git a/tests/test_base_worker.py b/tests/test_base_worker.py index 90953177c24c36db3dc42bd4c97f7bb4bf13cfce..e1d450780ffc14e54def8f9053c9cfe3c65446be 100644 --- a/tests/test_base_worker.py +++ b/tests/test_base_worker.py @@ -119,18 +119,56 @@ def test_cli_arg_verbose_given(mocker, mock_config_api): logger.setLevel(logging.NOTSET) -def test_configure_dev_mode(mocker, mock_user_api, mock_worker_version_api): +def test_configure_dev_mode( + mocker, monkeypatch, mock_user_api, mock_worker_version_api +): """ Configuring a worker in developer mode avoid retrieving process information """ worker = BaseWorker() mocker.patch.object(sys, "argv", ["worker", "--dev"]) + monkeypatch.setenv( + "ARKINDEX_WORKER_RUN_ID", "aaaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + ) worker.configure() assert worker.args.dev is True assert worker.process_information is None assert worker.worker_version_id == "12341234-1234-1234-1234-123412341234" assert worker.is_read_only is True + assert worker.user_configuration is None + + +def test_configure_worker_run(mocker, monkeypatch, responses, mock_config_api): + worker = BaseWorker() + mocker.patch.object(sys, "argv", ["worker"]) + run_id = "aaaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + monkeypatch.setenv("ARKINDEX_WORKER_RUN_ID", run_id) + responses.add( + responses.GET, + f"http://testserver/api/v1/imports/workers/{run_id}/", + json={"id": run_id, "configuration": {"a": "b"}}, + ) + worker.configure() + + assert worker.user_configuration == {"a": "b"} + + +def test_configure_worker_run_missing_conf( + mocker, monkeypatch, responses, mock_config_api +): + worker = BaseWorker() + mocker.patch.object(sys, "argv", ["worker"]) + run_id = "aaaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + monkeypatch.setenv("ARKINDEX_WORKER_RUN_ID", run_id) + responses.add( + responses.GET, + f"http://testserver/api/v1/imports/workers/{run_id}/", + json={"id": run_id}, + ) + worker.configure() + + assert worker.user_configuration is None def test_load_missing_secret(): diff --git a/tests/test_cache.py b/tests/test_cache.py index 127201901a11ee0cea1b29d8536da2cc27fb73e8..b034ed8b8846f732854550b60bfb460773f47e3d 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -58,7 +58,7 @@ def test_create_tables(tmp_path): CREATE TABLE "elements" ("id" TEXT NOT NULL PRIMARY KEY, "parent_id" TEXT, "type" VARCHAR(50) NOT NULL, "image_id" TEXT, "polygon" text, "initial" INTEGER NOT NULL, "worker_version_id" TEXT, FOREIGN KEY ("image_id") REFERENCES "images" ("id")) CREATE TABLE "entities" ("id" TEXT NOT NULL PRIMARY KEY, "type" VARCHAR(50) NOT NULL, "name" TEXT NOT NULL, "validated" INTEGER NOT NULL, "metas" text, "worker_version_id" TEXT NOT NULL) CREATE TABLE "images" ("id" TEXT NOT NULL PRIMARY KEY, "width" INTEGER NOT NULL, "height" INTEGER NOT NULL, "url" TEXT NOT NULL) -CREATE TABLE "transcription_entities" ("transcription_id" TEXT NOT NULL, "entity_id" TEXT NOT NULL, "offset" INTEGER NOT NULL CHECK (offset >= 0), "length" INTEGER NOT NULL CHECK (length > 0), PRIMARY KEY ("transcription_id", "entity_id"), FOREIGN KEY ("transcription_id") REFERENCES "transcriptions" ("id"), FOREIGN KEY ("entity_id") REFERENCES "entities" ("id")) +CREATE TABLE "transcription_entities" ("transcription_id" TEXT NOT NULL, "entity_id" TEXT NOT NULL, "offset" INTEGER NOT NULL CHECK (offset >= 0), "length" INTEGER NOT NULL CHECK (length > 0), "worker_version_id" TEXT NOT NULL, PRIMARY KEY ("transcription_id", "entity_id"), FOREIGN KEY ("transcription_id") REFERENCES "transcriptions" ("id"), FOREIGN KEY ("entity_id") REFERENCES "entities" ("id")) CREATE TABLE "transcriptions" ("id" TEXT NOT NULL PRIMARY KEY, "element_id" TEXT NOT NULL, "text" TEXT NOT NULL, "confidence" REAL NOT NULL, "worker_version_id" TEXT NOT NULL, FOREIGN KEY ("element_id") REFERENCES "elements" ("id"))""" actual_schema = "\n".join( diff --git a/tests/test_elements_worker/test_elements.py b/tests/test_elements_worker/test_elements.py index 370cbb6b06fbe2691f2767c8da7d5580cb6b0482..02c04ca34c83c7020ceec55217f61430f5fd822f 100644 --- a/tests/test_elements_worker/test_elements.py +++ b/tests/test_elements_worker/test_elements.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- import json -import os -import tempfile from argparse import Namespace from uuid import UUID @@ -11,58 +9,102 @@ from apistar.exceptions import ErrorResponse from arkindex_worker.cache import CachedElement, CachedImage from arkindex_worker.models import Element from arkindex_worker.worker import ElementsWorker +from arkindex_worker.worker.element import MissingTypeError from . import BASE_API_CALLS -def test_list_elements_elements_list_arg_wrong_type(monkeypatch, mock_elements_worker): - _, path = tempfile.mkstemp() - with open(path, "w") as f: - json.dump({}, f) +def test_check_required_types_argument_types(mock_elements_worker): + corpus_id = "12341234-1234-1234-1234-123412341234" + worker = ElementsWorker() + + with pytest.raises(AssertionError) as e: + worker.check_required_types(corpus_id) + assert str(e.value) == "At least one element type slug is required." + + with pytest.raises(AssertionError) as e: + worker.check_required_types(corpus_id, "lol", 42) + assert str(e.value) == "Element type slugs must be strings." + - monkeypatch.setenv("TASK_ELEMENTS", path) +def test_check_required_types(monkeypatch, tmp_path, mock_elements_worker, responses): + elements_path = tmp_path / "elements.json" + elements_path.write_text("[]") + monkeypatch.setenv("TASK_ELEMENTS", str(elements_path)) + + corpus_id = "12341234-1234-1234-1234-123412341234" + responses.add( + responses.GET, + f"http://testserver/api/v1/corpus/{corpus_id}/", + json={ + "id": corpus_id, + "name": "Some Corpus", + "types": [{"slug": "folder"}, {"slug": "page"}], + }, + ) + worker = ElementsWorker() + worker.configure() + + assert worker.check_required_types(corpus_id, "page") + assert worker.check_required_types(corpus_id, "page", "folder") + + with pytest.raises(MissingTypeError) as e: + assert worker.check_required_types(corpus_id, "page", "text_line", "act") + assert ( + str(e.value) + == "Element type(s) act, text_line were not found in the Some Corpus corpus (12341234-1234-1234-1234-123412341234)." + ) + + +def test_list_elements_elements_list_arg_wrong_type( + monkeypatch, tmp_path, mock_elements_worker +): + elements_path = tmp_path / "elements.json" + elements_path.write_text("{}") + + monkeypatch.setenv("TASK_ELEMENTS", str(elements_path)) worker = ElementsWorker() worker.configure() - os.unlink(path) with pytest.raises(AssertionError) as e: worker.list_elements() assert str(e.value) == "Elements list must be a list" -def test_list_elements_elements_list_arg_empty_list(monkeypatch, mock_elements_worker): - _, path = tempfile.mkstemp() - with open(path, "w") as f: - json.dump([], f) +def test_list_elements_elements_list_arg_empty_list( + monkeypatch, tmp_path, mock_elements_worker +): + elements_path = tmp_path / "elements.json" + elements_path.write_text("[]") - monkeypatch.setenv("TASK_ELEMENTS", path) + monkeypatch.setenv("TASK_ELEMENTS", str(elements_path)) worker = ElementsWorker() worker.configure() - os.unlink(path) with pytest.raises(AssertionError) as e: worker.list_elements() assert str(e.value) == "No elements in elements list" -def test_list_elements_elements_list_arg_missing_id(monkeypatch, mock_elements_worker): - _, path = tempfile.mkstemp() - with open(path, "w") as f: +def test_list_elements_elements_list_arg_missing_id( + monkeypatch, tmp_path, mock_elements_worker +): + elements_path = tmp_path / "elements.json" + with elements_path.open("w") as f: json.dump([{"type": "volume"}], f) - monkeypatch.setenv("TASK_ELEMENTS", path) + monkeypatch.setenv("TASK_ELEMENTS", str(elements_path)) worker = ElementsWorker() worker.configure() - os.unlink(path) elt_list = worker.list_elements() assert elt_list == [] -def test_list_elements_elements_list_arg(monkeypatch, mock_elements_worker): - _, path = tempfile.mkstemp() - with open(path, "w") as f: +def test_list_elements_elements_list_arg(monkeypatch, tmp_path, mock_elements_worker): + elements_path = tmp_path / "elements.json" + with elements_path.open("w") as f: json.dump( [ {"id": "volumeid", "type": "volume"}, @@ -73,10 +115,9 @@ def test_list_elements_elements_list_arg(monkeypatch, mock_elements_worker): f, ) - monkeypatch.setenv("TASK_ELEMENTS", path) + monkeypatch.setenv("TASK_ELEMENTS", str(elements_path)) worker = ElementsWorker() worker.configure() - os.unlink(path) elt_list = worker.list_elements() @@ -103,9 +144,9 @@ def test_list_elements_element_arg(mocker, mock_elements_worker): assert elt_list == ["volumeid", "pageid"] -def test_list_elements_both_args_error(mocker, mock_elements_worker): - _, path = tempfile.mkstemp() - with open(path, "w") as f: +def test_list_elements_both_args_error(mocker, mock_elements_worker, tmp_path): + elements_path = tmp_path / "elements.json" + with elements_path.open("w") as f: json.dump( [ {"id": "volumeid", "type": "volume"}, @@ -120,7 +161,7 @@ def test_list_elements_both_args_error(mocker, mock_elements_worker): return_value=Namespace( element=["anotherid", "againanotherid"], verbose=False, - elements_list=open(path), + elements_list=elements_path.open(), database=None, dev=False, ), @@ -128,7 +169,6 @@ def test_list_elements_both_args_error(mocker, mock_elements_worker): worker = ElementsWorker() worker.configure() - os.unlink(path) with pytest.raises(AssertionError) as e: worker.list_elements() @@ -847,7 +887,7 @@ def test_create_elements(responses, mock_elements_worker_with_cache, tmp_path): assert created_ids == [{"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08"}] # Check that created elements were properly stored in SQLite cache - assert os.path.isfile(tmp_path / "db.sqlite") + assert (tmp_path / "db.sqlite").is_file() assert list(CachedElement.select()) == [ CachedElement( diff --git a/tests/test_elements_worker/test_entities.py b/tests/test_elements_worker/test_entities.py index 301f2a21bf028e0c6e02a85974ece6bc973d8e30..8b16ba15fe409fe0c19568fac7f3cfb4cdcef83b 100644 --- a/tests/test_elements_worker/test_entities.py +++ b/tests/test_elements_worker/test_entities.py @@ -213,7 +213,7 @@ def test_create_entity(responses, mock_elements_worker): assert json.loads(responses.calls[-1].request.body) == { "name": "Bob Bob", "type": "person", - "metas": None, + "metas": {}, "validated": None, "corpus": "12341234-1234-1234-1234-123412341234", "worker_version": "12341234-1234-1234-1234-123412341234", @@ -247,7 +247,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache): assert json.loads(responses.calls[-1].request.body) == { "name": "Bob Bob", "type": "person", - "metas": None, + "metas": {}, "validated": None, "corpus": "12341234-1234-1234-1234-123412341234", "worker_version": "12341234-1234-1234-1234-123412341234", @@ -261,7 +261,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache): type="person", name="Bob Bob", validated=False, - metas=None, + metas={}, worker_version_id=UUID("12341234-1234-1234-1234-123412341234"), ) ] @@ -449,6 +449,7 @@ def test_create_transcription_entity(responses, mock_elements_worker): "entity": "11111111-1111-1111-1111-111111111111", "offset": 5, "length": 10, + "worker_version_id": "12341234-1234-1234-1234-123412341234", } @@ -504,6 +505,7 @@ def test_create_transcription_entity_with_cache( "entity": "11111111-1111-1111-1111-111111111111", "offset": 5, "length": 10, + "worker_version_id": "12341234-1234-1234-1234-123412341234", } # Check that created transcription entity was properly stored in SQLite cache @@ -513,5 +515,6 @@ def test_create_transcription_entity_with_cache( entity=UUID("11111111-1111-1111-1111-111111111111"), offset=5, length=10, + worker_version_id=UUID("12341234-1234-1234-1234-123412341234"), ) ] diff --git a/worker-{{cookiecutter.slug}}/requirements.txt b/worker-{{cookiecutter.slug}}/requirements.txt index 11552bee7452f3ecab87e62d8d8b28a014f98e32..0f7c87038f8e73b5fd8e55de874f7dbe9e5103ed 100644 --- a/worker-{{cookiecutter.slug}}/requirements.txt +++ b/worker-{{cookiecutter.slug}}/requirements.txt @@ -1 +1 @@ -arkindex-base-worker==0.2.0 +arkindex-base-worker==0.2.1