Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • workers/base-worker
1 result
Show changes
Commits on Source (5)
......@@ -30,7 +30,7 @@ repos:
- id: trailing-whitespace
- id: check-yaml
args: [--allow-multiple-documents]
exclude: "^worker-{{cookiecutter.slug}}/.arkindex.yml$"
exclude: "^worker-{{cookiecutter.slug}}/.arkindex.yml$|^mkdocs.yml$"
- id: mixed-line-ending
- id: name-tests-test
args: ['--django']
......
0.3.2-rc5
0.3.2-rc6
......@@ -19,7 +19,7 @@ from arkindex_worker.reporting import Reporter
from arkindex_worker.worker.base import BaseWorker
from arkindex_worker.worker.classification import ClassificationMixin
from arkindex_worker.worker.element import ElementMixin
from arkindex_worker.worker.entity import EntityMixin, EntityType # noqa: F401
from arkindex_worker.worker.entity import EntityMixin # noqa: F401
from arkindex_worker.worker.metadata import MetaDataMixin, MetaType # noqa: F401
from arkindex_worker.worker.transcription import TranscriptionMixin
from arkindex_worker.worker.version import WorkerVersionMixin # noqa: F401
......@@ -92,6 +92,10 @@ class ElementsWorker(
self.classes = {}
self.entity_types = {}
"""Known and available entity types in processed corpus
"""
self._worker_version_cache = {}
def list_elements(self) -> Union[Iterable[CachedElement], List[str]]:
......
......@@ -23,10 +23,10 @@ class ClassificationMixin(object):
"ListCorpusMLClasses",
id=self.corpus_id,
)
self.classes[self.corpus_id] = {
ml_class["name"]: ml_class["id"] for ml_class in corpus_classes
}
logger.info(f"Loaded {len(self.classes[self.corpus_id])} ML classes")
self.classes = {ml_class["name"]: ml_class["id"] for ml_class in corpus_classes}
logger.info(
f"Loaded {len(self.classes)} ML classes in corpus ({self.corpus_id})"
)
def get_ml_class_id(self, ml_class: str) -> str:
"""
......@@ -36,17 +36,17 @@ class ClassificationMixin(object):
:param ml_class: Name of the MLClass.
:returns: ID of the retrieved or created MLClass.
"""
if not self.classes.get(self.corpus_id):
if not self.classes:
self.load_corpus_classes()
ml_class_id = self.classes[self.corpus_id].get(ml_class)
ml_class_id = self.classes.get(ml_class)
if ml_class_id is None:
logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}")
try:
response = self.request(
"CreateMLClass", id=self.corpus_id, body={"name": ml_class}
)
ml_class_id = self.classes[self.corpus_id][ml_class] = response["id"]
ml_class_id = self.classes[ml_class] = response["id"]
logger.debug(f"Created ML class {response['id']}")
except ErrorResponse as e:
# Only reload for 400 errors
......@@ -59,9 +59,9 @@ class ClassificationMixin(object):
)
self.load_corpus_classes()
assert (
ml_class in self.classes[self.corpus_id]
ml_class in self.classes
), "Missing class {ml_class} even after reloading"
ml_class_id = self.classes[self.corpus_id][ml_class]
ml_class_id = self.classes[ml_class]
return ml_class_id
......@@ -73,14 +73,14 @@ class ClassificationMixin(object):
:return: The MLClass's name
"""
# Load the corpus' MLclasses if they are not available yet
if self.corpus_id not in self.classes:
if not self.classes:
self.load_corpus_classes()
# Filter classes by this ml_class_id
ml_class_name = next(
filter(
lambda x: self.classes[self.corpus_id][x] == ml_class_id,
self.classes[self.corpus_id],
lambda x: self.classes[x] == ml_class_id,
self.classes,
),
None,
)
......
......@@ -3,8 +3,7 @@
ElementsWorker methods for entities.
"""
from enum import Enum
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Union
from peewee import IntegrityError
......@@ -13,26 +12,54 @@ from arkindex_worker.cache import CachedElement, CachedEntity, CachedTranscripti
from arkindex_worker.models import Element, Transcription
class EntityType(Enum):
class MissingEntityType(Exception):
"""
Type of an entity.
Raised when the specified entity type was not found in the corpus and
the worker cannot create it.
"""
Person = "person"
Location = "location"
Subject = "subject"
Organization = "organization"
Misc = "misc"
Number = "number"
Date = "date"
class EntityMixin(object):
def check_required_entity_types(
    self, entity_types: List[str], create_missing: bool = True
):
    """Make sure every requested entity type is known for the corpus.

    The worker's ``entity_types`` mapping (name -> ID) is loaded from the
    corpus when it is still empty. Each requested name that is absent from
    that mapping is then either created through the API or reported as an
    error, depending on ``create_missing``.

    :param entity_types: Names of the entity types that must be available.
    :param create_missing: Whether a missing type should be created. Defaults to True.
    :raises MissingEntityType: When a type is missing and ``create_missing`` is False.
    """
    # Lazily populate the name -> ID mapping on first use
    if not self.entity_types:
        self.list_corpus_entity_types()

    for entity_type in entity_types:
        if entity_type not in self.entity_types:
            # Unknown type: fail right away when creation is not allowed
            if not create_missing:
                raise MissingEntityType(
                    f"Entity type `{entity_type}` was not in the corpus."
                )

            # Otherwise create it and remember the ID returned by the API
            created_type = self.request(
                "CreateEntityType",
                body={"name": entity_type, "corpus": self.corpus_id},
            )
            self.entity_types[entity_type] = created_type["id"]
            logger.info(f"Created a new entity type with name `{entity_type}`.")
def create_entity(
self,
element: Union[Element, CachedElement],
name: str,
type: EntityType,
type: str,
metas=dict(),
validated=None,
):
......@@ -52,8 +79,8 @@ class EntityMixin(object):
name, str
), "name shouldn't be null and should be of type str"
assert type and isinstance(
type, EntityType
), "type shouldn't be null and should be of type EntityType"
type, str
), "type shouldn't be null and should be of type str"
if metas:
assert isinstance(metas, dict), "metas should be of type dict"
if validated is not None:
......@@ -62,18 +89,26 @@ class EntityMixin(object):
logger.warning("Cannot create entity as this worker is in read-only mode")
return
# Retrieve entity_type ID
if not self.entity_types:
# Load entity_types of corpus
self.list_corpus_entity_types()
entity_type_id = self.entity_types.get(type)
assert entity_type_id, f"Entity type `{type}` not found in the corpus."
entity = self.request(
"CreateEntity",
body={
"name": name,
"type": type.value,
"type_id": entity_type_id,
"metas": metas,
"validated": validated,
"corpus": self.corpus_id,
"worker_run_id": self.worker_run_id,
},
)
self.report.add_entity(element.id, entity["id"], type.value, name)
self.report.add_entity(element.id, entity["id"], type, name)
if self.use_cache:
# Store entity in local cache
......@@ -81,7 +116,7 @@ class EntityMixin(object):
to_insert = [
{
"id": entity["id"],
"type": type.value,
"type": type,
"name": name,
"validated": validated if validated is not None else False,
"metas": metas,
......@@ -225,3 +260,19 @@ class EntityMixin(object):
return self.api_client.paginate(
"ListCorpusEntities", id=self.corpus_id, **query_params
)
def list_corpus_entity_types(
    self,
):
    """
    Fetch the entity types of the current corpus and store them on the worker.

    ``self.entity_types`` is replaced by a dictionary mapping each entity
    type name to its ID, built from the paginated ``ListCorpusEntityTypes``
    results.
    """
    type_ids_by_name = {}
    corpus_types = self.api_client.paginate(
        "ListCorpusEntityTypes", id=self.corpus_id
    )
    for entity_type in corpus_types:
        type_ids_by_name[entity_type["name"]] = entity_type["id"]
    self.entity_types = type_ids_by_name

    logger.info(
        f"Loaded {len(self.entity_types)} entity types in corpus ({self.corpus_id})."
    )
# Configuration
When the worker is running over elements, be it locally or on Arkindex, the first step before actually doing
anything is configuration. This process is implemented in the `configure` method.
This method can also be overloaded if the worker needs additional configuration steps.
The developer mode was designed to help worker developers reproduce and test how their worker
would behave on Arkindex. This is why the configuration process in this mode mirrors the operations done on Arkindex while
replacing configuration API calls with CLI arguments.
The developer mode (or `read-only` mode) is enabled when at least one of the following is true:
- the `--dev` CLI argument is used,
- the `ARKINDEX_WORKER_RUN_ID` variable was not set in the environment.
Neither of these happens when running on Arkindex.
## Parallel between both modes
```mermaid
flowchart TB
subgraph configure[Configuration step]
argument_parsing[CLI argument parsing]
end
argument_parsing --> is_read_only{IsReadOnly?}
is_read_only -- Yes --> devMode
is_read_only -- No --> arkindexMode
subgraph arkindexMode[Arkindex mode]
direction TB
subgraph workerConfiguration[Worker configuration]
direction TB
retrieveWorkerRun["API call to RetrieveWorkerRun"] --> userconfig_defaults[Initialize user configuration with default values]
userconfig_defaults --> load_secrets_API["Load Secrets using API calls to RetrieveSecret"]
load_secrets_API --> load_user_config[Override user configuration by values set by user]
load_user_config --> load_model_config["Load model configuration"]
end
workerConfiguration --> cacheConfiguration
subgraph cacheConfiguration[Base worker cache setup]
direction TB
get_paths_from_parent_tasks["Retrieve paths of parent tasks' cache databases"] --> initialize_db[Create cache database and its tables]
initialize_db --> merge_parent_databases[Merge parents databases]
end
end
subgraph devMode[Developer mode]
direction TB
subgraph devWorkerConfiguration[Worker configuration]
direction TB
configuration_parsing[CLI config argument parsing] --> corpus_id[Read Corpus ID from environment]
corpus_id --> load_secrets[Load secret in local developer storage]
end
end
classDef pyMeth font-style:italic
```
## Arkindex mode
The details of a worker execution (what is called a **WorkerRun**) on Arkindex are stored in the backend. The first step of the configuration is to retrieve this information using the Arkindex API. The [RetrieveWorkerRun](https://demo.arkindex.org/api-docs/#tag/process/operation/RetrieveWorkerRun) endpoint gives information about:
- the running process,
- the configuration parameters that the user may have added from the frontend,
- the worker used,
- the version of this worker,
- the configuration stored in this version,
- the model version used in this worker execution,
- the configuration stored in this model version.
This step shows that there are a lot of sources for the actual configuration that the worker can use. Nothing is overridden by default; the worker has to do it in its overridden version of the `configure` method. In the end, any parameter set by the user **must** be applied over other known configurations.
!!! warning
The convention is to always give the final word to the user. This means that when the user configuration is filled, its values must be the last to override the worker's `config` attribute. If a model configuration was set, its values must override this attribute before the user configuration's.
The worker configuration may specify default values for some parameters (see [this section](../workers/yaml.md#setting-up-user-configurable-parameters) for more details about worker configuration). These default values are stored in the `user_configuration` dictionary attribute.
This is also when the secrets (see [this section](../secrets/usage.md#declaring-secrets-in-workers) to learn more about secrets) are actually downloaded. They are stored in the `secrets` dictionary attribute.
An Arkindex-mode exclusive step is done after all that: the cache setup. Some workers benefit a lot, performance-wise, from having a SQLite cache artifact from previous workers. This is mostly used in processes with multiple workers with dependencies, where the second worker needs the results of the first one to work. The database is initialized, the tables created and its version checked as it must match the one supported by the Arkindex instances. The database is then merged with any other database generated by previous worker runs.
## Developer mode
In the developer mode, the worker execution is not linked to anything on Arkindex. Therefore, the only configuration the worker can use is provided via the `--config` CLI argument. It supports YAML-formatted files and should be similar to the `configuration` section of the [worker configuration file](../workers/yaml/#single-worker-configuration), without the `user_configuration` details. More details about how to create the local worker configuration are available in [this section](../workers/run-local/).
The multiple configuration sources from the Arkindex mode are merged into a single one here. The configuration parameters are parsed as well as the list of required secrets. The secrets are loaded using a local Arkindex client. Again, see the [section about local execution](../workers/run-local/) for more details.
One piece of information cannot be retrieved directly from the configuration file and is required in some cases: the ID of the Arkindex corpus to which the processed elements belong. This is retrieved via the `ARKINDEX_CORPUS_ID` environment variable.
## Worker reporter
At the end of a worker execution, a report about the publication done by the worker is generated in JSON format. It lists:
- the starting time,
- the number of elements created, grouped by type,
- the number of transcriptions created,
- the number of classifications created, grouped by class,
- the number of entities created,
- the number of entities created on transcriptions,
- the number of metadatas created,
- the encountered errors' logs.
This is done by the many helpers described in the [reporting module](../../ref/reporting.md). They use the `report` attribute initialized at the configuration stage.
## Setting Debug logging level
There are three ways to activate the debug mode:
- the `--verbose` CLI argument,
- setting the `ARKINDEX_DEBUG` environment variable to `True`,
- setting `"debug": True` in the worker's configuration via any configuration source.
## Important class attributes
Many attributes are set on the worker during the configuration stage. Here is a *non-exhaustive* list with some details about their source and their usage.
`api_client`
: The Arkindex API client used by the worker to make the requests. One should not rely on this attribute to make API calls but use the many helpers available. The exception is for endpoints where no helper is available.
`args`
: The arguments passed via the CLI. This is used to trigger the Developer mode via `--dev`, to specify the configuration file via `--config` and to list elements to process via `--element`.
`config`
: A dictionary with the worker's configuration. This is filled by the worker run's configuration, the worker version's and the model version's if there is any.
`corpus_id`
: The ID of the corpus linked to the current process. This is mostly needed when publishing objects linked to a corpus like `Entities`. You may set it in developer mode via the `ARKINDEX_CORPUS_ID` environment variable.
`is_read_only`
: This is the computed property that determines which mode should be used. The Developer mode prevents any actual publication on Arkindex, hence the name `read_only`.
`model_configuration`
: The parsed configuration as stored in the `ModelVersion` object on Arkindex.
`process_information`
: The details about the parent process of this worker execution. Only set in Arkindex mode.
`report`
: The `Reporter` instance that will generate the `ml_report.json` artifact, which sums up the publication done during this execution and the errors encountered.
`secrets`
: A dictionary mapping each secret name to its parsed content.
`use_cache`
: Whether the cache optimization is available or not.
`user_configuration`
: The parsed configuration as the user entered it via the Arkindex frontend. Any parameter not specified will be filled with its default value if there is one.
`worker_details`
: The details of the worker used in this execution.
`worker_run_id`
: The ID of the corresponding `WorkerRun` object on the Arkindex instance. In Arkindex mode, this is used in the `RetrieveWorkerRun` API call to retrieve the configuration and other necessary information. In developer mode, this is neither set nor used.
`worker_version_id`
: The ID of the `WorkerVersion` object linked to the current `WorkerRun`. Like the `worker_run_id` attribute, this is neither set nor used in developer mode.
# Worker Implementation
This section presents:
- the different stages happening during a worker execution:
- the initialization
- the [configuration](./configure.md)
- the execution
- the conception of a worker
- the architecture
- additional configuration steps
- element processing
The following graph describes what happens when running the worker, either on Arkindex or locally. Words in italic font are actual method calls in the worker.
```mermaid
flowchart LR
subgraph all[Worker execution]
direction LR
subgraph id1[Worker initialization]
init
end
run -.-> configure
subgraph id2[Inference]
direction TB
configure --> list_elements
list_elements --> element_processing
subgraph id3[Loop over each element]
element_processing --> element_processing
end
element_processing -- Save ML report to disk --> reporting
end
init --> run
end
classDef pyMeth font-style:italic
class init,run,configure,list_elements pyMeth
```
The following graph gives more details about the `element_processing` step.
```mermaid
flowchart LR
subgraph all[Element processing]
direction LR
subgraph id1[Element details retrieval]
retrieve_element
end
retrieve_element --> update_activity_started
subgraph id2[Processing]
direction LR
update_activity_started[update_activity] --> process_element -- No errors --> update_activity_processed
update_activity_started -- to Started --> update_activity_started
update_activity_processed[update_activity] -- to Processed --> update_activity_processed
update_activity_error[update_activity] -- to Error --> update_activity_error
end
process_element -- Errors found --> update_activity_error
end
classDef pyMeth font-style:italic
class process_element,update_activity_started,update_activity_error,update_activity_processed pyMeth
```
......@@ -20,7 +20,7 @@ For a worker to run properly, you will need two types of credentials:
For the worker to run, you will need an Arkindex authentication token.
You can use your own account's token when testing on your own machine.
You can retrieve your personal API Token from your [profile page](https://doc.arkindex.org/users/auth/index.md#personal-token).
You can retrieve your personal API Token from your [profile page](https://doc.arkindex.org/users/auth/#personal-token).
### Retrieving a worker version ID
......
......@@ -3,7 +3,7 @@
::: arkindex_worker.worker.entity
options:
members:
- EntityType
- MissingEntityType
options:
show_category_heading: no
::: arkindex_worker.worker.entity.EntityMixin
......
......@@ -70,6 +70,9 @@ nav:
- Using secrets in workers:
- contents/secrets/index.md
- Usage: contents/secrets/usage.md
- Worker Implementation:
- contents/implem/index.md
- Configuration: contents/implem/configure.md
- Python Reference:
- Base Worker: ref/base_worker.md
- Elements Worker: ref/elements_worker.md
......@@ -101,7 +104,11 @@ markdown_extensions:
- admonition # syntax coloration in code blocks
- codehilite
- pymdownx.details
- pymdownx.superfences
- pymdownx.superfences:
custom_fences:
- name: mermaid
class: mermaid
format: !!python/name:pymdownx.superfences.fence_code_format # yamllint disable-line
copyright: Copyright © Teklia
......
......@@ -41,18 +41,14 @@ def test_get_ml_class_id_load_classes(responses, mock_elements_worker):
f"http://testserver/api/v1/corpus/{corpus_id}/classes/",
),
]
assert mock_elements_worker.classes == {
"11111111-1111-1111-1111-111111111111": {"good": "0000"}
}
assert mock_elements_worker.classes == {"good": "0000"}
assert ml_class_id == "0000"
def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses):
# A missing class is now created automatically
corpus_id = "11111111-1111-1111-1111-111111111111"
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"good": "0000"}
}
mock_elements_worker.classes = {"good": "0000"}
responses.add(
responses.POST,
......@@ -62,26 +58,20 @@ def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses):
)
# Missing class at first
assert mock_elements_worker.classes == {
"11111111-1111-1111-1111-111111111111": {"good": "0000"}
}
assert mock_elements_worker.classes == {"good": "0000"}
ml_class_id = mock_elements_worker.get_ml_class_id("bad")
assert ml_class_id == "new-ml-class-1234"
# Now it's available
assert mock_elements_worker.classes == {
"11111111-1111-1111-1111-111111111111": {
"good": "0000",
"bad": "new-ml-class-1234",
}
"good": "0000",
"bad": "new-ml-class-1234",
}
def test_get_ml_class_id(mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"good": "0000"}
}
mock_elements_worker.classes = {"good": "0000"}
ml_class_id = mock_elements_worker.get_ml_class_id("good")
assert ml_class_id == "0000"
......@@ -139,10 +129,8 @@ def test_get_ml_class_reload(responses, mock_elements_worker):
assert len(responses.calls) == len(BASE_API_CALLS) + 3
assert mock_elements_worker.classes == {
corpus_id: {
"class1": "class1_id",
"class2": "class2_id",
}
"class1": "class1_id",
"class2": "class2_id",
}
assert [
(call.request.method, call.request.url) for call in responses.calls
......@@ -166,7 +154,7 @@ def test_retrieve_ml_class_in_cache(mock_elements_worker):
"""
Look for a class that exists in cache -> No API Call
"""
mock_elements_worker.classes[mock_elements_worker.corpus_id] = {"class1": "uuid1"}
mock_elements_worker.classes = {"class1": "uuid1"}
assert mock_elements_worker.retrieve_ml_class("uuid1") == "class1"
......@@ -262,9 +250,7 @@ def test_create_classification_wrong_ml_class(mock_elements_worker, responses):
status=201,
json={"id": "new-classification-1234"},
)
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"another_class": "0000"}
}
mock_elements_worker.classes = {"another_class": "0000"}
mock_elements_worker.create_classification(
element=elt,
ml_class="a_class",
......@@ -298,9 +284,7 @@ def test_create_classification_wrong_ml_class(mock_elements_worker, responses):
def test_create_classification_wrong_confidence(mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker.classes = {"a_class": "0000"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
with pytest.raises(AssertionError) as e:
mock_elements_worker.create_classification(
......@@ -352,9 +336,7 @@ def test_create_classification_wrong_confidence(mock_elements_worker):
def test_create_classification_wrong_high_confidence(mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker.classes = {"a_class": "0000"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
with pytest.raises(AssertionError) as e:
......@@ -381,9 +363,7 @@ def test_create_classification_wrong_high_confidence(mock_elements_worker):
def test_create_classification_api_error(responses, mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker.classes = {"a_class": "0000"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
responses.add(
responses.POST,
......@@ -413,9 +393,7 @@ def test_create_classification_api_error(responses, mock_elements_worker):
def test_create_classification(responses, mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker.classes = {"a_class": "0000"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
responses.add(
responses.POST,
......@@ -452,9 +430,7 @@ def test_create_classification(responses, mock_elements_worker):
def test_create_classification_with_cache(responses, mock_elements_worker_with_cache):
mock_elements_worker_with_cache.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker_with_cache.classes = {"a_class": "0000"}
elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
responses.add(
......@@ -513,9 +489,7 @@ def test_create_classification_with_cache(responses, mock_elements_worker_with_c
def test_create_classification_duplicate_worker_run(responses, mock_elements_worker):
mock_elements_worker.classes = {
"11111111-1111-1111-1111-111111111111": {"a_class": "0000"}
}
mock_elements_worker.classes = {"a_class": "0000"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
responses.add(
responses.POST,
......@@ -870,9 +844,10 @@ def test_create_classifications(responses, mock_elements_worker_with_cache):
# Set MLClass in cache
portrait_uuid = str(uuid4())
landscape_uuid = str(uuid4())
mock_elements_worker_with_cache.classes[
mock_elements_worker_with_cache.corpus_id
] = {"portrait": portrait_uuid, "landscape": landscape_uuid}
mock_elements_worker_with_cache.classes = {
"portrait": portrait_uuid,
"landscape": landscape_uuid,
}
elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
classes = [
......
......@@ -350,11 +350,9 @@ def test_load_corpus_classes(responses, mock_elements_worker):
),
]
assert mock_elements_worker.classes == {
"11111111-1111-1111-1111-111111111111": {
"good": "0000",
"average": "1111",
"bad": "2222",
}
"good": "0000",
"average": "1111",
"bad": "2222",
}
......
......@@ -4,6 +4,7 @@ from uuid import UUID
import pytest
from apistar.exceptions import ErrorResponse
from responses import matchers
from arkindex_worker.cache import (
CachedElement,
......@@ -12,7 +13,7 @@ from arkindex_worker.cache import (
CachedTranscriptionEntity,
)
from arkindex_worker.models import Element, Transcription
from arkindex_worker.worker import EntityType
from arkindex_worker.worker.entity import MissingEntityType
from arkindex_worker.worker.transcription import TextOrientation
from . import BASE_API_CALLS
......@@ -23,7 +24,7 @@ def test_create_entity_wrong_element(mock_elements_worker):
mock_elements_worker.create_entity(
element=None,
name="Bob Bob",
type=EntityType.Person,
type="person",
)
assert (
str(e.value)
......@@ -34,7 +35,7 @@ def test_create_entity_wrong_element(mock_elements_worker):
mock_elements_worker.create_entity(
element="not element type",
name="Bob Bob",
type=EntityType.Person,
type="person",
)
assert (
str(e.value)
......@@ -49,7 +50,7 @@ def test_create_entity_wrong_name(mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name=None,
type=EntityType.Person,
type="person",
)
assert str(e.value) == "name shouldn't be null and should be of type str"
......@@ -57,7 +58,7 @@ def test_create_entity_wrong_name(mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name=1234,
type=EntityType.Person,
type="person",
)
assert str(e.value) == "name shouldn't be null and should be of type str"
......@@ -71,7 +72,7 @@ def test_create_entity_wrong_type(mock_elements_worker):
name="Bob Bob",
type=None,
)
assert str(e.value) == "type shouldn't be null and should be of type EntityType"
assert str(e.value) == "type shouldn't be null and should be of type str"
with pytest.raises(AssertionError) as e:
mock_elements_worker.create_entity(
......@@ -79,15 +80,7 @@ def test_create_entity_wrong_type(mock_elements_worker):
name="Bob Bob",
type=1234,
)
assert str(e.value) == "type shouldn't be null and should be of type EntityType"
with pytest.raises(AssertionError) as e:
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type="not_an_entity_type",
)
assert str(e.value) == "type shouldn't be null and should be of type EntityType"
assert str(e.value) == "type shouldn't be null and should be of type str"
def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker):
......@@ -99,7 +92,7 @@ def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
metas="wrong metas",
)
assert str(e.value) == "metas should be of type dict"
......@@ -112,7 +105,7 @@ def test_create_entity_wrong_metas(mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
metas="wrong metas",
)
assert str(e.value) == "metas should be of type dict"
......@@ -125,13 +118,15 @@ def test_create_entity_wrong_validated(mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
validated="wrong validated",
)
assert str(e.value) == "validated should be of type bool"
def test_create_entity_api_error(responses, mock_elements_worker):
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
responses.add(
responses.POST,
......@@ -143,7 +138,7 @@ def test_create_entity_api_error(responses, mock_elements_worker):
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
)
assert len(responses.calls) == len(BASE_API_CALLS) + 5
......@@ -160,6 +155,9 @@ def test_create_entity_api_error(responses, mock_elements_worker):
def test_create_entity(responses, mock_elements_worker):
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
responses.add(
responses.POST,
......@@ -171,7 +169,7 @@ def test_create_entity(responses, mock_elements_worker):
entity_id = mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
)
assert len(responses.calls) == len(BASE_API_CALLS) + 1
......@@ -182,7 +180,7 @@ def test_create_entity(responses, mock_elements_worker):
]
assert json.loads(responses.calls[-1].request.body) == {
"name": "Bob Bob",
"type": "person",
"type_id": "person-entity-type-id",
"metas": {},
"validated": None,
"corpus": "11111111-1111-1111-1111-111111111111",
......@@ -191,7 +189,49 @@ def test_create_entity(responses, mock_elements_worker):
assert entity_id == "12345678-1234-1234-1234-123456789123"
def test_create_entity_missing_type(responses, mock_elements_worker):
"""
Creating an entity with a type that is not returned by the corpus
entity type listing fails with an AssertionError.
"""
elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
# Mock the call listing the corpus entity types: only `person` is returned
responses.add(
responses.GET,
"http://testserver/api/v1/corpus/11111111-1111-1111-1111-111111111111/entity-types/",
status=200,
json={
"count": 1,
"next": None,
"results": [
{"id": "person-entity-type-id", "name": "person", "color": "00d1b2"}
],
},
)
# `new-entity` is not part of the mocked listing, so the creation must be rejected
with pytest.raises(
AssertionError, match="Entity type `new-entity` not found in the corpus."
):
mock_elements_worker.create_entity(
element=elt,
name="Bob Bob",
type="new-entity",
)
# Exactly one extra API call is expected: the entity type listing (no entity creation request)
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
(
"GET",
"http://testserver/api/v1/corpus/11111111-1111-1111-1111-111111111111/entity-types/",
),
]
def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):
# Set one entity type
mock_elements_worker_with_cache.entity_types = {"person": "person-entity-type-id"}
elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
responses.add(
responses.POST,
......@@ -203,7 +243,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):
entity_id = mock_elements_worker_with_cache.create_entity(
element=elt,
name="Bob Bob",
type=EntityType.Person,
type="person",
)
assert len(responses.calls) == len(BASE_API_CALLS) + 1
......@@ -215,7 +255,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache):
assert json.loads(responses.calls[-1].request.body) == {
"name": "Bob Bob",
"type": "person",
"type_id": "person-entity-type-id",
"metas": {},
"validated": None,
"corpus": "11111111-1111-1111-1111-111111111111",
......@@ -784,3 +824,72 @@ def test_list_corpus_entities_wrong_parent(mock_elements_worker, wrong_parent):
with pytest.raises(AssertionError) as e:
mock_elements_worker.list_corpus_entities(parent=wrong_parent)
assert str(e.value) == "parent should be of type Element"
def test_check_required_entity_types(responses, mock_elements_worker):
# Checks that a required entity type missing from the worker's known types
# is created through the API and then stored in `entity_types`.
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
# `person` is already known, `new-entity` is not
checked_types = ["person", "new-entity"]
# Call to create new entity type
# The JSON matcher makes the mock respond only to the expected request payload
responses.add(
responses.POST,
"http://testserver/api/v1/entity/types/",
status=200,
match=[
matchers.json_params_matcher(
{
"name": "new-entity",
"corpus": "11111111-1111-1111-1111-111111111111",
}
)
],
json={
"id": "new-entity-id",
"corpus": "11111111-1111-1111-1111-111111111111",
"name": "new-entity",
"color": "ffd1b3",
},
)
# `create_missing` is left to its default value here
mock_elements_worker.check_required_entity_types(
entity_types=checked_types,
)
# Make sure the entity_types entry has been updated
assert mock_elements_worker.entity_types == {
"person": "person-entity-type-id",
"new-entity": "new-entity-id",
}
# Only one extra API call is expected: the creation of the missing type
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
(
"POST",
"http://testserver/api/v1/entity/types/",
),
]
def test_check_required_entity_types_no_creation_allowed(
responses, mock_elements_worker
):
# Checks that a missing entity type raises MissingEntityType, without any
# additional API call, when `create_missing` is False.
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
# `person` is already known, `new-entity` is not
checked_types = ["person", "new-entity"]
with pytest.raises(
MissingEntityType, match="Entity type `new-entity` was not in the corpus."
):
mock_elements_worker.check_required_entity_types(
entity_types=checked_types, create_missing=False
)
# No request beyond the base API calls must have been made
assert len(responses.calls) == len(BASE_API_CALLS)
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS