Compare revisions

Yoann Schneider · Bastien Abadie · Yoann Schneider · Bastien Abadie · Yoann Schneider · Bastien Abadie
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
      - id: trailing-whitespace
      - id: check-yaml
        args: [--allow-multiple-documents]
-        exclude: "^worker-{{cookiecutter.slug}}/.arkindex.yml$"
+        exclude: "^worker-{{cookiecutter.slug}}/.arkindex.yml$|^mkdocs.yml$"
      - id: mixed-line-ending
      - id: name-tests-test
        args: ['--django']

--- a/VERSION
+++ b/VERSION
-0.3.2-rc5
+0.3.2-rc6
--- a/arkindex_worker/worker/__init__.py
+++ b/arkindex_worker/worker/__init__.py
@@ -19,7 +19,7 @@ from arkindex_worker.reporting import Reporter
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.classification import ClassificationMixin
 from arkindex_worker.worker.element import ElementMixin
-from arkindex_worker.worker.entity import EntityMixin, EntityType  # noqa: F401
+from arkindex_worker.worker.entity import EntityMixin  # noqa: F401
 from arkindex_worker.worker.metadata import MetaDataMixin, MetaType  # noqa: F401
 from arkindex_worker.worker.transcription import TranscriptionMixin
 from arkindex_worker.worker.version import WorkerVersionMixin  # noqa: F401
@@ -92,6 +92,10 @@ class ElementsWorker(

        self.classes = {}

+        self.entity_types = {}
+        """Known and available entity types in processed corpus
+        """
+
        self._worker_version_cache = {}

    def list_elements(self) -> Union[Iterable[CachedElement], List[str]]:

--- a/arkindex_worker/worker/classification.py
+++ b/arkindex_worker/worker/classification.py
@@ -23,10 +23,10 @@ class ClassificationMixin(object):
            "ListCorpusMLClasses",
            id=self.corpus_id,
        )
-        self.classes[self.corpus_id] = {
-            ml_class["name"]: ml_class["id"] for ml_class in corpus_classes
-        }
-        logger.info(f"Loaded {len(self.classes[self.corpus_id])} ML classes")
+        self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
+        logger.info(
+            f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
+        )

    def get_ml_class_id(self, ml_class: str) -> str:
        """
@@ -36,17 +36,17 @@ class ClassificationMixin(object):
        :param ml_class: Name of the MLClass.
        :returns: ID of the retrieved or created MLClass.
        """
-        if not self.classes.get(self.corpus_id):
+        if not self.classes:
            self.load_corpus_classes()

-        ml_class_id = self.classes[self.corpus_id].get(ml_class)
+        ml_class_id = self.classes.get(ml_class)
        if ml_class_id is None:
            logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
            try:
                response = self.request(
                    "CreateMLClass", id=self.corpus_id, body={"name": ml_class}
                )
-                ml_class_id = self.classes[self.corpus_id][ml_class] = response["id"]
+                ml_class_id = self.classes[ml_class] = response["id"]
                logger.debug(f"Created ML class {response['id']}")
            except ErrorResponse as e:
                # Only reload for 400 errors
@@ -59,9 +59,9 @@ class ClassificationMixin(object):
                )
                self.load_corpus_classes()
                assert (
-                    ml_class in self.classes[self.corpus_id]
+                    ml_class in self.classes
                ), "Missing class {ml_class} even after reloading"
-                ml_class_id = self.classes[self.corpus_id][ml_class]
+                ml_class_id = self.classes[ml_class]

        return ml_class_id

@@ -73,14 +73,14 @@ class ClassificationMixin(object):
        :return: The MLClass's name
        """
        # Load the corpus' MLclasses if they are not available yet
-        if self.corpus_id not in self.classes:
+        if not self.classes:
            self.load_corpus_classes()

        # Filter classes by this ml_class_id
        ml_class_name = next(
            filter(
-                lambda x: self.classes[self.corpus_id][x] == ml_class_id,
-                self.classes[self.corpus_id],
+                lambda x: self.classes[x] == ml_class_id,
+                self.classes,
            ),
            None,
        )

--- a/arkindex_worker/worker/entity.py
+++ b/arkindex_worker/worker/entity.py
@@ -3,8 +3,7 @@
 ElementsWorker methods for entities.
 """

-from enum import Enum
-from typing import Dict, Optional, Union
+from typing import Dict, List, Optional, Union

 from peewee import IntegrityError

@@ -13,26 +12,54 @@ from arkindex_worker.cache import CachedElement, CachedEntity, CachedTranscripti
 from arkindex_worker.models import Element, Transcription


-class EntityType(Enum):
+class MissingEntityType(Exception):
    """
-    Type of an entity.
+    Raised when the specified entity type was not found in the corpus and
+    the worker cannot create it.
    """

-    Person = "person"
-    Location = "location"
-    Subject = "subject"
-    Organization = "organization"
-    Misc = "misc"
-    Number = "number"
-    Date = "date"
-

 class EntityMixin(object):
+    def check_required_entity_types(
+        self, entity_types: List[str], create_missing: bool = True
+    ):
+        """Check that every required entity type is available in the corpus.
+        Missing ones are created through the API when `create_missing` is True.
+
+        :param entity_types: Entity type names to search.
+        :param create_missing: Whether the missing types should be created. Defaults to True.
+        :raises MissingEntityType: When an entity type is missing and `create_missing` is False.
+        """
+        # Populate the name -> ID mapping when it is still empty
+        if not self.entity_types:
+            # Load entity_types of corpus
+            self.list_corpus_entity_types()
+        for entity_type in entity_types:
+            # Do nothing if type already exists
+            if entity_type in self.entity_types:
+                continue
+
+            # Do not create missing if not requested
+            if not create_missing:
+                raise MissingEntityType(
+                    f"Entity type `{entity_type}` was not in the corpus."
+                )
+
+            # Create the missing type and remember the ID returned by the API
+            self.entity_types[entity_type] = self.request(
+                "CreateEntityType",
+                body={
+                    "name": entity_type,
+                    "corpus": self.corpus_id,
+                },
+            )["id"]
+            logger.info(f"Created a new entity type with name `{entity_type}`.")
+
    def create_entity(
        self,
        element: Union[Element, CachedElement],
        name: str,
-        type: EntityType,
+        type: str,
        metas=dict(),
        validated=None,
    ):
@@ -52,8 +79,8 @@ class EntityMixin(object):
            name, str
        ), "name shouldn't be null and should be of type str"
        assert type and isinstance(
-            type, EntityType
-        ), "type shouldn't be null and should be of type EntityType"
+            type, str
+        ), "type shouldn't be null and should be of type str"
        if metas:
            assert isinstance(metas, dict), "metas should be of type dict"
        if validated is not None:
@@ -62,18 +89,26 @@ class EntityMixin(object):
            logger.warning("Cannot create entity as this worker is in read-only mode")
            return

+        # Retrieve entity_type ID
+        if not self.entity_types:
+            # Load entity_types of corpus
+            self.list_corpus_entity_types()
+
+        entity_type_id = self.entity_types.get(type)
+        assert entity_type_id, f"Entity type `{type}` not found in the corpus."
+
        entity = self.request(
            "CreateEntity",
            body={
                "name": name,
-                "type": type.value,
+                "type_id": entity_type_id,
                "metas": metas,
                "validated": validated,
                "corpus": self.corpus_id,
                "worker_run_id": self.worker_run_id,
            },
        )
-        self.report.add_entity(element.id, entity["id"], type.value, name)
+        self.report.add_entity(element.id, entity["id"], type, name)

        if self.use_cache:
            # Store entity in local cache
@@ -81,7 +116,7 @@ class EntityMixin(object):
                to_insert = [
                    {
                        "id": entity["id"],
-                        "type": type.value,
+                        "type": type,
                        "name": name,
                        "validated": validated if validated is not None else False,
                        "metas": metas,
@@ -225,3 +260,19 @@ class EntityMixin(object):
        return self.api_client.paginate(
            "ListCorpusEntities", id=self.corpus_id, **query_params
        )
+
+    def list_corpus_entity_types(
+        self,
+    ):
+        """
+        Load the corpus entity types into `self.entity_types`, mapping each type name to its ID.
+        """
+        self.entity_types = {
+            entity_type["name"]: entity_type["id"]
+            for entity_type in self.api_client.paginate(
+                "ListCorpusEntityTypes", id=self.corpus_id
+            )
+        }
+        logger.info(
+            f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
+        )
--- a/docs/contents/implem/configure.md
+++ b/docs/contents/implem/configure.md
+# Configuration
+
+When the worker is running over elements, be it locally or on Arkindex, the first step before actually doing
+anything is configuration. This process is implemented in the `configure` method.
+This method can also be overridden if the worker needs additional configuration steps.
+
+The developer mode was designed to help worker developers reproduce and test how their worker
+would behave on Arkindex. This is why the configuration process in this mode mirrors the operations done on Arkindex while
+replacing configuration API calls by CLI arguments.
+
+The developer mode (or `read-only` mode) is enabled when at least one of the following is true:
+
+- the `--dev` CLI argument is used,
+- the `ARKINDEX_WORKER_RUN_ID` variable was not set in the environment.
+
+Neither of these happens when running on Arkindex.
+
+## Parallel between both modes
+
+```mermaid
+flowchart TB
+    subgraph configure[Configuration step]
+        argument_parsing[CLI argument parsing]
+    end
+    argument_parsing --> is_read_only{IsReadOnly?}
+    is_read_only -- Yes --> devMode
+    is_read_only -- No --> arkindexMode
+    subgraph arkindexMode[Arkindex mode]
+        direction TB
+        subgraph workerConfiguration[Worker configuration]
+            direction TB
+            retrieveWorkerRun["API call to RetrieveWorkerRun"] --> userconfig_defaults[Initialize user configuration with default values]
+            userconfig_defaults --> load_secrets_API["Load Secrets using API calls to RetrieveSecret"]
+            load_secrets_API --> load_user_config[Override user configuration by values set by user]
+            load_user_config --> load_model_config["Load model configuration"]
+        end
+        workerConfiguration --> cacheConfiguration
+        subgraph cacheConfiguration[Base worker cache setup]
+            direction TB
+            get_paths_from_parent_tasks["Retrieve paths of parent tasks' cache databases"] --> initialize_db[Create cache database and its tables]
+            initialize_db --> merge_parent_databases[Merge parents databases]
+        end
+    end
+
+    subgraph devMode[Developer mode]
+        direction TB
+        subgraph devWorkerConfiguration[Worker configuration]
+            direction TB
+            configuration_parsing[CLI config argument parsing] --> corpus_id[Read Corpus ID from environment]
+            corpus_id --> load_secrets[Load secret in local developer storage]
+        end
+    end
+    classDef pyMeth font-style:italic
+```
+
+## Arkindex mode
+The details of a worker execution (what is called a **WorkerRun**) on Arkindex are stored in the backend. The first step of the configuration is to retrieve this information using the Arkindex API. The [RetrieveWorkerRun](https://demo.arkindex.org/api-docs/#tag/process/operation/RetrieveWorkerRun) endpoint gives information about:
+
+- the running process,
+- the configuration parameters that the user may have added from the frontend,
+- the worker used,
+- the version of this worker,
+- the configuration stored in this version,
+- the model version used in this worker execution,
+- the configuration stored in this model version.
+
+This step shows that there are a lot of sources for the actual configuration that the worker can use. Nothing is overridden by default, the worker has to do it in its overridden version of the configure method. In the end, any parameter set by the user **must** be applied over other known configurations.
+
+!!! warning
+
+    The convention is to always give the final word to the user. This means that when the user configuration is filled, its values must be the last to override the worker's `config` attribute. If a model configuration was set, its values must override this attribute before the user configuration's.
+
+The worker configuration may specify default values for some parameters (see [this section](../workers/yaml.md#setting-up-user-configurable-parameters) for more details about worker configuration). These default values are stored in the `user_configuration` dictionary attribute.
+
+This is also when the secrets (see [this section](../secrets/usage.md#declaring-secrets-in-workers) to learn more about secrets) are actually downloaded. They are stored in the `secrets` dictionary attribute.
+
+An Arkindex-mode exclusive step is done after all that: the cache setup. Some workers benefit a lot, performance-wise, from having a SQLite cache artifact from previous workers. This is mostly used in processes with multiple workers with dependencies, where the second worker needs the results of the first one to work. The database is initialized, the tables created and its version checked as it must match the one supported by the Arkindex instances. The database is then merged with any other database generated by previous worker runs.
+
+## Developer mode
+In the developer mode, the worker execution is not linked to anything on Arkindex. Therefore, the only configuration the worker can use is provided via the `--config` CLI argument. It expects a YAML-formatted file, which should be similar to the `configuration` section of the [worker configuration file](../workers/yaml.md#single-worker-configuration), without the `user_configuration` details. More details about how to create the local worker configuration are available in [this section](../workers/run-local.md).
+
+The multiple configuration sources of the Arkindex mode are merged into a single one here. The configuration parameters are parsed as well as the list of required secrets. The secrets are loaded using a local Arkindex client. Again, see the [section about local execution](../workers/run-local.md) for more details.
+
+One piece of information cannot be retrieved directly from the configuration file and is required in some cases: the ID of the Arkindex corpus to which the processed elements belong. This is retrieved via the `ARKINDEX_CORPUS_ID` environment variable.
+
+## Worker reporter
+At the end of a worker execution, a report about the publications made by the worker is generated in JSON format. It lists:
+
+- the starting time,
+- the number of elements created, grouped by type,
+- the number of transcriptions created,
+- the number of classifications created, grouped by class,
+- the number of entities created,
+- the number of entities created on transcriptions,
+- the number of metadatas created,
+- the encountered errors' logs.
+
+This is done by the many helpers described in the [reporting module](../../ref/reporting.md). They use the `report` attribute initialized at the configuration stage.
+
+## Setting Debug logging level
+There are three ways to activate the debug mode:
+
+- the `--verbose` CLI argument,
+- setting the `ARKINDEX_DEBUG` environment variable to `True`,
+- setting `"debug": True` in the worker's configuration via any configuration source.
+
+## Important class attributes
+Many attributes are set on the worker during the configuration stage. Here is a *non-exhaustive* list with some details about their source and their usage.
+
+
+`api_client`
+: The Arkindex API client used by the worker to make the requests. One should not rely on this attribute to make API calls but use the many helpers available. The exception is for endpoints where no helper is available.
+
+`args`
+: The arguments passed via the CLI. This is used to trigger the Developer mode via `--dev`, to specify the configuration file via `--config` and to list elements to process via `--element`.
+
+`config`
+: A dictionary with the worker's configuration. This is filled by the worker run's configuration, the worker version's and the model version's if there is any.
+
+`corpus_id`
+: The ID of the corpus linked to the current process. This is mostly needed when publishing objects linked to a corpus like `Entities`. You may set it in developer mode via the `ARKINDEX_CORPUS_ID` environment variable.
+
+`is_read_only`
+: This is the computed property that determines which mode should be used. The Developer mode prevents any actual publication on Arkindex, hence the name `read_only`.
+
+`model_configuration`
+: The parsed configuration as stored in the `ModelVersion` object on Arkindex.
+
+`process_information`
+: The details about the process parent to this worker execution. Only set in Arkindex mode.
+
+`report`
+: The `Reporter` instance that will generate the `ml_report.json` artifact, which sums up the publications made during this execution and the errors encountered.
+
+`secrets`
+: A dictionary mapping each secret name to its parsed content.
+
+`use_cache`
+: Whether the cache optimization is available or not.
+
+`user_configuration`
+: The parsed configuration as the user entered it via the Arkindex frontend. Any parameter not specified will be filled with its default value if there is one.
+
+`worker_details`
+: The details of the worker used in this execution.
+
+`worker_run_id`
+: The ID of the corresponding `WorkerRun` object on the Arkindex instance. In Arkindex mode, this is used in the `RetrieveWorkerRun` API call to retrieve the configuration and other necessary information. In developer mode, this is neither set nor used.
+
+`worker_version_id`
+: The ID of the `WorkerVersion` object linked to the current `WorkerRun`. Like the `worker_run_id` attribute, this is neither set nor used in developer mode.
--- a/docs/contents/implem/index.md
+++ b/docs/contents/implem/index.md
+# Worker Implementation
+
+This section presents
+
+- the different stages happening during a worker execution:
+    - the initialization
+    - the [configuration](./configure.md)
+    - the execution
+- the conception of a worker
+    - the architecture
+    - additional configuration steps
+    - element processing
+
+The following graph describes what happens when running the worker, either on Arkindex or locally. Words in italic font are actual method calls in the worker.
+
+```mermaid
+flowchart LR
+    subgraph all[Worker execution]
+        direction LR
+        subgraph id1[Worker initialization]
+            init
+        end
+        run -.-> configure
+        subgraph id2[Inference]
+            direction TB
+            configure --> list_elements
+            list_elements --> element_processing
+            subgraph id3[Loop over each element]
+                element_processing --> element_processing
+            end
+            element_processing -- Save ML report to disk --> reporting
+        end
+        init --> run
+    end
+    classDef pyMeth font-style:italic
+    class init,run,configure,list_elements pyMeth
+```
+
+The following graph gives more details about the `element_processing` step.
+
+```mermaid
+flowchart LR
+    subgraph all[Element processing]
+        direction LR
+        subgraph id1[Element details retrieval]
+            retrieve_element
+        end
+        retrieve_element --> update_activity_started
+        subgraph id2[Processing]
+            direction LR
+            update_activity_started[update_activity] --> process_element -- No errors --> update_activity_processed
+            update_activity_started -- to Started --> update_activity_started
+            update_activity_processed[update_activity] -- to Processed --> update_activity_processed
+            update_activity_error[update_activity] -- to Error --> update_activity_error
+        end
+        process_element -- Errors found --> update_activity_error
+    end
+    classDef pyMeth font-style:italic
+    class process_element,update_activity_started,update_activity_error,update_activity_processed pyMeth
+```
+
--- a/docs/contents/workers/run-local.md
+++ b/docs/contents/workers/run-local.md
@@ -20,7 +20,7 @@ For a worker to run properly, you will need two types of credentials:
 For the worker to run, you will need an Arkindex authentication token.
 You can use your own account's token when testing on your own machine.

-You can retrieve your personal API Token from your [profile page](https://doc.arkindex.org/users/auth/index.md#personal-token).
+You can retrieve your personal API Token from your [profile page](https://doc.arkindex.org/users/auth/#personal-token).

 ### Retrieving a worker version ID


--- a/docs/ref/api/entity.md
+++ b/docs/ref/api/entity.md
@@ -3,7 +3,7 @@
 ::: arkindex_worker.worker.entity
    options:
      members:
-        - EntityType
+        - MissingEntityType
    options:
      show_category_heading: no
 ::: arkindex_worker.worker.entity.EntityMixin

--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -70,6 +70,9 @@ nav:
  - Using secrets in workers:
    - contents/secrets/index.md
    - Usage: contents/secrets/usage.md
+  - Worker Implementation:
+      - contents/implem/index.md
+      - Configuration: contents/implem/configure.md
  - Python Reference:
      - Base Worker: ref/base_worker.md
      - Elements Worker: ref/elements_worker.md
@@ -101,7 +104,11 @@ markdown_extensions:
    - admonition # syntax coloration in code blocks
    - codehilite
    - pymdownx.details
-    - pymdownx.superfences
+    - pymdownx.superfences:
+        custom_fences:
+          - name: mermaid
+            class: mermaid
+            format: !!python/name:pymdownx.superfences.fence_code_format # yamllint disable-line

 copyright:  Copyright &copy; Teklia


--- a/tests/test_elements_worker/test_classifications.py
+++ b/tests/test_elements_worker/test_classifications.py
@@ -41,18 +41,14 @@ def test_get_ml_class_id_load_classes(responses, mock_elements_worker):
            f"http://testserver/api/v1/corpus/{corpus_id}/classes/",
        ),
    ]
-    assert mock_elements_worker.classes == {
-        "11111111-1111-1111-1111-111111111111": {"good": "0000"}
-    }
+    assert mock_elements_worker.classes == {"good": "0000"}
    assert ml_class_id == "0000"


 def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses):
    # A missing class is now created automatically
    corpus_id = "11111111-1111-1111-1111-111111111111"
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"good": "0000"}
-    }
+    mock_elements_worker.classes = {"good": "0000"}

    responses.add(
        responses.POST,
@@ -62,26 +58,20 @@ def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses):
    )

    # Missing class at first
-    assert mock_elements_worker.classes == {
-        "11111111-1111-1111-1111-111111111111": {"good": "0000"}
-    }
+    assert mock_elements_worker.classes == {"good": "0000"}

    ml_class_id = mock_elements_worker.get_ml_class_id("bad")
    assert ml_class_id == "new-ml-class-1234"

    # Now it's available
    assert mock_elements_worker.classes == {
-        "11111111-1111-1111-1111-111111111111": {
-            "good": "0000",
-            "bad": "new-ml-class-1234",
-        }
+        "good": "0000",
+        "bad": "new-ml-class-1234",
    }


 def test_get_ml_class_id(mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"good": "0000"}
-    }
+    mock_elements_worker.classes = {"good": "0000"}

    ml_class_id = mock_elements_worker.get_ml_class_id("good")
    assert ml_class_id == "0000"
@@ -139,10 +129,8 @@ def test_get_ml_class_reload(responses, mock_elements_worker):

    assert len(responses.calls) == len(BASE_API_CALLS) + 3
    assert mock_elements_worker.classes == {
-        corpus_id: {
-            "class1": "class1_id",
-            "class2": "class2_id",
-        }
+        "class1": "class1_id",
+        "class2": "class2_id",
    }
    assert [
        (call.request.method, call.request.url) for call in responses.calls
@@ -166,7 +154,7 @@ def test_retrieve_ml_class_in_cache(mock_elements_worker):
    """
    Look for a class that exists in cache -> No API Call
    """
-    mock_elements_worker.classes[mock_elements_worker.corpus_id] = {"class1": "uuid1"}
+    mock_elements_worker.classes = {"class1": "uuid1"}

    assert mock_elements_worker.retrieve_ml_class("uuid1") == "class1"

@@ -262,9 +250,7 @@ def test_create_classification_wrong_ml_class(mock_elements_worker, responses):
        status=201,
        json={"id": "new-classification-1234"},
    )
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"another_class": "0000"}
-    }
+    mock_elements_worker.classes = {"another_class": "0000"}
    mock_elements_worker.create_classification(
        element=elt,
        ml_class="a_class",
@@ -298,9 +284,7 @@ def test_create_classification_wrong_ml_class(mock_elements_worker, responses):


 def test_create_classification_wrong_confidence(mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker.classes = {"a_class": "0000"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    with pytest.raises(AssertionError) as e:
        mock_elements_worker.create_classification(
@@ -352,9 +336,7 @@ def test_create_classification_wrong_confidence(mock_elements_worker):


 def test_create_classification_wrong_high_confidence(mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker.classes = {"a_class": "0000"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})

    with pytest.raises(AssertionError) as e:
@@ -381,9 +363,7 @@ def test_create_classification_wrong_high_confidence(mock_elements_worker):


 def test_create_classification_api_error(responses, mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker.classes = {"a_class": "0000"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    responses.add(
        responses.POST,
@@ -413,9 +393,7 @@ def test_create_classification_api_error(responses, mock_elements_worker):


 def test_create_classification(responses, mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker.classes = {"a_class": "0000"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    responses.add(
        responses.POST,
@@ -452,9 +430,7 @@ def test_create_classification(responses, mock_elements_worker):


 def test_create_classification_with_cache(responses, mock_elements_worker_with_cache):
-    mock_elements_worker_with_cache.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker_with_cache.classes = {"a_class": "0000"}
    elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")

    responses.add(
@@ -513,9 +489,7 @@ def test_create_classification_with_cache(responses, mock_elements_worker_with_c


 def test_create_classification_duplicate_worker_run(responses, mock_elements_worker):
-    mock_elements_worker.classes = {
-        "11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
-    }
+    mock_elements_worker.classes = {"a_class": "0000"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    responses.add(
        responses.POST,
@@ -870,9 +844,10 @@ def test_create_classifications(responses, mock_elements_worker_with_cache):
    # Set MLClass in cache
    portrait_uuid = str(uuid4())
    landscape_uuid = str(uuid4())
-    mock_elements_worker_with_cache.classes[
-        mock_elements_worker_with_cache.corpus_id
-    ] = {"portrait": portrait_uuid, "landscape": landscape_uuid}
+    mock_elements_worker_with_cache.classes = {
+        "portrait": portrait_uuid,
+        "landscape": landscape_uuid,
+    }

    elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
    classes = [

--- a/tests/test_elements_worker/test_elements.py
+++ b/tests/test_elements_worker/test_elements.py
@@ -350,11 +350,9 @@ def test_load_corpus_classes(responses, mock_elements_worker):
        ),
    ]
    assert mock_elements_worker.classes == {
-        "11111111-1111-1111-1111-111111111111": {
-            "good": "0000",
-            "average": "1111",
-            "bad": "2222",
-        }
+        "good": "0000",
+        "average": "1111",
+        "bad": "2222",
    }



--- a/tests/test_elements_worker/test_entities.py
+++ b/tests/test_elements_worker/test_entities.py
@@ -4,6 +4,7 @@ from uuid import UUID

 import pytest
 from apistar.exceptions import ErrorResponse
+from responses import matchers

 from arkindex_worker.cache import (
    CachedElement,
@@ -12,7 +13,7 @@ from arkindex_worker.cache import (
    CachedTranscriptionEntity,
 )
 from arkindex_worker.models import Element, Transcription
-from arkindex_worker.worker import EntityType
+from arkindex_worker.worker.entity import MissingEntityType
 from arkindex_worker.worker.transcription import TextOrientation

 from . import BASE_API_CALLS
@@ -23,7 +24,7 @@ def test_create_entity_wrong_element(mock_elements_worker):
        mock_elements_worker.create_entity(
            element=None,
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
        )
    assert (
        str(e.value)
@@ -34,7 +35,7 @@ def test_create_entity_wrong_element(mock_elements_worker):
        mock_elements_worker.create_entity(
            element="not element type",
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
        )
    assert (
        str(e.value)
@@ -49,7 +50,7 @@ def test_create_entity_wrong_name(mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name=None,
-            type=EntityType.Person,
+            type="person",
        )
    assert str(e.value) == "name shouldn't be null and should be of type str"

@@ -57,7 +58,7 @@ def test_create_entity_wrong_name(mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name=1234,
-            type=EntityType.Person,
+            type="person",
        )
    assert str(e.value) == "name shouldn't be null and should be of type str"

@@ -71,7 +72,7 @@ def test_create_entity_wrong_type(mock_elements_worker):
            name="Bob Bob",
            type=None,
        )
-    assert str(e.value) == "type shouldn't be null and should be of type EntityType"
+    assert str(e.value) == "type shouldn't be null and should be of type str"

    with pytest.raises(AssertionError) as e:
        mock_elements_worker.create_entity(
@@ -79,15 +80,7 @@ def test_create_entity_wrong_type(mock_elements_worker):
            name="Bob Bob",
            type=1234,
        )
-    assert str(e.value) == "type shouldn't be null and should be of type EntityType"
-
-    with pytest.raises(AssertionError) as e:
-        mock_elements_worker.create_entity(
-            element=elt,
-            name="Bob Bob",
-            type="not_an_entity_type",
-        )
-    assert str(e.value) == "type shouldn't be null and should be of type EntityType"
+    assert str(e.value) == "type shouldn't be null and should be of type str"


 def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker):
@@ -99,7 +92,7 @@ def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
            metas="wrong metas",
        )
    assert str(e.value) == "metas should be of type dict"
@@ -112,7 +105,7 @@ def test_create_entity_wrong_metas(mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
            metas="wrong metas",
        )
    assert str(e.value) == "metas should be of type dict"
@@ -125,13 +118,15 @@ def test_create_entity_wrong_validated(mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
            validated="wrong validated",
        )
    assert str(e.value) == "validated should be of type bool"


 def test_create_entity_api_error(responses, mock_elements_worker):
+    # Set one entity type
+    mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    responses.add(
        responses.POST,
@@ -143,7 +138,7 @@ def test_create_entity_api_error(responses, mock_elements_worker):
        mock_elements_worker.create_entity(
            element=elt,
            name="Bob Bob",
-            type=EntityType.Person,
+            type="person",
        )

    assert len(responses.calls) == len(BASE_API_CALLS) + 5
@@ -160,6 +155,9 @@ def test_create_entity_api_error(responses, mock_elements_worker):


 def test_create_entity(responses, mock_elements_worker):
+    # Set one entity type
+    mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
+
    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
    responses.add(
        responses.POST,
@@ -171,7 +169,7 @@ def test_create_entity(responses, mock_elements_worker):
    entity_id = mock_elements_worker.create_entity(
        element=elt,
        name="Bob Bob",
-        type=EntityType.Person,
+        type="person",
    )

    assert len(responses.calls) == len(BASE_API_CALLS) + 1
@@ -182,7 +180,7 @@ def test_create_entity(responses, mock_elements_worker):
    ]
    assert json.loads(responses.calls[-1].request.body) == {
        "name": "Bob Bob",
-        "type": "person",
+        "type_id": "person-entity-type-id",
        "metas": {},
        "validated": None,
        "corpus": "11111111-1111-1111-1111-111111111111",
@@ -191,7 +189,49 @@ def test_create_entity(responses, mock_elements_worker):
    assert entity_id == "12345678-1234-1234-1234-123456789123"


+def test_create_entity_missing_type(responses, mock_elements_worker):
+    """
+    Creating an entity with a type missing from the corpus raises AssertionError.
+    """
+    elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
+
+    # Mock the entity types listing endpoint: the corpus only exposes `person`
+    responses.add(
+        responses.GET,
+        "http://testserver/api/v1/corpus/11111111-1111-1111-1111-111111111111/entity-types/",
+        status=200,
+        json={
+            "count": 1,
+            "next": None,
+            "results": [
+                {"id": "person-entity-type-id", "name": "person", "color": "00d1b2"}
+            ],
+        },
+    )
+
+    with pytest.raises(
+        AssertionError, match="Entity type `new-entity` not found in the corpus."
+    ):
+        mock_elements_worker.create_entity(
+            element=elt,
+            name="Bob Bob",
+            type="new-entity",
+        )
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        (
+            "GET",
+            "http://testserver/api/v1/corpus/11111111-1111-1111-1111-111111111111/entity-types/",
+        ),
+    ]
+
+
 def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):
+    # Set one entity type
+    mock_elements_worker_with_cache.entity_types = {"person": "person-entity-type-id"}
    elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
    responses.add(
        responses.POST,
@@ -203,7 +243,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):
    entity_id = mock_elements_worker_with_cache.create_entity(
        element=elt,
        name="Bob Bob",
-        type=EntityType.Person,
+        type="person",
    )

    assert len(responses.calls) == len(BASE_API_CALLS) + 1
@@ -215,7 +255,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):

    assert json.loads(responses.calls[-1].request.body) == {
        "name": "Bob Bob",
-        "type": "person",
+        "type_id": "person-entity-type-id",
        "metas": {},
        "validated": None,
        "corpus": "11111111-1111-1111-1111-111111111111",
@@ -784,3 +824,72 @@ def test_list_corpus_entities_wrong_parent(mock_elements_worker, wrong_parent):
    with pytest.raises(AssertionError) as e:
        mock_elements_worker.list_corpus_entities(parent=wrong_parent)
    assert str(e.value) == "parent should be of type Element"
+
+
+def test_check_required_entity_types(responses, mock_elements_worker):
+    # The worker already knows `person`; `new-entity` is absent from its mapping
+    mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
+
+    checked_types = ["person", "new-entity"]
+
+    # Mock the creation endpoint; the matcher only accepts a `new-entity` payload
+    responses.add(
+        responses.POST,
+        "http://testserver/api/v1/entity/types/",
+        status=200,
+        match=[
+            matchers.json_params_matcher(
+                {
+                    "name": "new-entity",
+                    "corpus": "11111111-1111-1111-1111-111111111111",
+                }
+            )
+        ],
+        json={
+            "id": "new-entity-id",
+            "corpus": "11111111-1111-1111-1111-111111111111",
+            "name": "new-entity",
+            "color": "ffd1b3",
+        },
+    )
+
+    mock_elements_worker.check_required_entity_types(
+        entity_types=checked_types,
+    )
+
+    # The returned ID must be stored in entity_types next to the existing entry
+    assert mock_elements_worker.entity_types == {
+        "person": "person-entity-type-id",
+        "new-entity": "new-entity-id",
+    }
+
+    assert len(responses.calls) == len(BASE_API_CALLS) + 1
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS + [
+        (
+            "POST",
+            "http://testserver/api/v1/entity/types/",
+        ),
+    ]
+
+
+def test_check_required_entity_types_no_creation_allowed(
+    responses, mock_elements_worker
+):
+    # Only `person` is known; with create_missing=False nothing may be created
+    mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
+
+    checked_types = ["person", "new-entity"]
+
+    with pytest.raises(
+        MissingEntityType, match="Entity type `new-entity` was not in the corpus."
+    ):
+        mock_elements_worker.check_required_entity_types(
+            entity_types=checked_types, create_missing=False
+        )
+
+    assert len(responses.calls) == len(BASE_API_CALLS)
+    assert [
+        (call.request.method, call.request.url) for call in responses.calls
+    ] == BASE_API_CALLS
No results found