Skip to content
Snippets Groups Projects
Commit 63acfaa5 authored by Eva Bardou's avatar Eva Bardou :frog: Committed by Yoann Schneider
Browse files

Refactor and harmonize the element/entity type helpers

parent ad247c74
No related branches found
No related tags found
1 merge request: !627 Refactor and harmonize the element/entity type helpers
Pipeline #215238 passed
......@@ -507,7 +507,7 @@ def revert_orientation(
assert polygon and isinstance(polygon, list), (
"polygon shouldn't be null and should be a list"
)
assert isinstance(reverse, bool), "Reverse should be a bool"
assert isinstance(reverse, bool), "reverse should be a bool"
# Rotating with Pillow can cause it to move the image around, as the image cannot have negative coordinates
# and must be a rectangle. This means the origin point of any coordinates from an image is invalid, and the
# center of the bounding box of the rotated image is different from the center of the element's bounding box.
......
......@@ -49,7 +49,7 @@ class ClassificationMixin:
"CreateMLClass", id=self.corpus_id, body={"name": ml_class}
)
ml_class_id = self.classes[ml_class] = response["id"]
logger.debug(f"Created ML class {response['id']}")
logger.debug(f"Created a new ML class {response['id']}")
except ErrorResponse as e:
# Only reload for 400 errors
if e.status_code != 400:
......@@ -57,11 +57,11 @@ class ClassificationMixin:
# Reload and make sure we have the class
logger.info(
f"Reloading corpus classes to see if {ml_class} already exists"
f"Unable to create the ML class `{ml_class}`. Refreshing corpus classes cache."
)
self.load_corpus_classes()
assert ml_class in self.classes, (
"Missing class {ml_class} even after reloading"
f"Missing ML class {ml_class} even after refreshing."
)
ml_class_id = self.classes[ml_class]
......
......@@ -5,12 +5,12 @@ ElementsWorker methods for elements and element types.
import os
from collections.abc import Iterable
from operator import attrgetter
from typing import NamedTuple
from uuid import UUID
from warnings import warn
from peewee import IntegrityError
from arkindex.exceptions import ErrorResponse
from arkindex_worker import logger
from arkindex_worker.cache import CachedElement, CachedImage, unsupported_cache
from arkindex_worker.models import Element
......@@ -22,19 +22,10 @@ from arkindex_worker.utils import (
)
class ElementType(NamedTuple):
class MissingElementType(Exception):
"""
Arkindex Type of an element
"""
name: str
slug: str
is_folder: bool
class MissingTypeError(Exception):
"""
A required element type was not found in a corpus.
Raised when the specified element type was not found in the corpus and
the worker cannot create it.
"""
......@@ -71,57 +62,92 @@ class ElementMixin:
)
@unsupported_cache
def create_required_types(self, element_types: list[ElementType]):
"""Creates given element types in the corpus.
def create_element_type(
self, slug: str, name: str, is_folder: bool = False
) -> None:
"""
Create an element type on the given corpus.
:param element_types: The missing element types to create.
:param slug: Slug of the element type.
:param name: Name of the element type.
:param is_folder: Whether an element with this type can contain other elements or not.
"""
for element_type in element_types:
self.api_client.request(
assert slug and isinstance(slug, str), (
"slug shouldn't be null and should be of type str"
)
assert name and isinstance(name, str), (
"name shouldn't be null and should be of type str"
)
assert is_folder is not None and isinstance(is_folder, bool), (
"is_folder shouldn't be null and should be of type bool"
)
try:
element_type = self.api_client.request(
"CreateElementType",
body={
"slug": element_type.slug,
"display_name": element_type.name,
"folder": element_type.is_folder,
"slug": slug,
"display_name": name,
"folder": is_folder,
"corpus": self.corpus_id,
},
)
logger.info(f"Created a new element type with slug {element_type.slug}")
self.corpus_types[slug] = element_type
logger.info(f"Created a new element type with slug `{slug}`.")
except ErrorResponse as e:
# Only reload for 400 errors
if e.status_code != 400:
raise
# Reload and make sure we have the element type now
logger.warning(
f"Unable to create the element type `{slug}`. Refreshing corpus element types cache."
)
self.list_corpus_types()
assert slug in self.corpus_types, (
f"Missing element type `{slug}` even after refreshing."
)
def check_required_types(
self, *type_slugs: str, create_missing: bool = False
) -> bool:
self, type_slugs: list[str], create_missing: bool = False
) -> None:
"""
Check that a corpus has a list of required element types,
and raise an exception if any of them are missing.
Check that every element type needed is available in the corpus.
Missing ones may be created automatically if needed.
:param *type_slugs: Type slugs to look for.
:param create_missing: Whether missing types should be created.
:returns: Whether all of the specified type slugs have been found.
:raises MissingTypeError: If any of the specified type slugs were not found.
:param type_slugs: Element type slugs to search.
:param create_missing: Whether the missing types should be created. Defaults to False.
:raises MissingElementType: When an element type is missing and cannot be created.
"""
assert len(type_slugs), "At least one element type slug is required."
assert all(isinstance(slug, str) for slug in type_slugs), (
"Element type slugs must be strings."
assert type_slugs and isinstance(type_slugs, list), (
"type_slugs shouldn't be null and should be of type list"
)
for index, slug in enumerate(type_slugs):
assert isinstance(slug, str), (
f"Element type at index {index} in type_slugs: Should be of type str"
)
assert create_missing is not None and isinstance(create_missing, bool), (
"create_missing shouldn't be null and should be of type bool"
)
if not self.corpus_types:
self.list_corpus_types()
missing_slugs = set(type_slugs) - set(self.corpus_types)
if missing_slugs:
if create_missing:
self.create_required_types(
element_types=[
ElementType(slug, slug, False) for slug in missing_slugs
],
)
else:
raise MissingTypeError(
f"Element {pluralize('type', len(missing_slugs))} {', '.join(sorted(missing_slugs))} were not found in corpus ({self.corpus_id})."
for slug in type_slugs:
# Do nothing if the type already exists
if slug in self.corpus_types:
continue
# Do not create missing if not requested
if not create_missing:
raise MissingElementType(
f"Element type `{slug}` was not in the corpus."
)
return True
# Create the type if non-existent
self.create_element_type(slug=slug, name=slug)
@unsupported_cache
def create_sub_element(
......
......@@ -8,6 +8,7 @@ from warnings import warn
from peewee import IntegrityError
from arkindex.exceptions import ErrorResponse
from arkindex_worker import logger
from arkindex_worker.cache import (
CachedEntity,
......@@ -34,24 +35,85 @@ class MissingEntityType(Exception):
class EntityMixin:
def list_corpus_entity_types(self):
"""
Loads available entity types in corpus.
"""
self.entity_types = {
entity_type["name"]: entity_type["id"]
for entity_type in self.api_client.paginate(
"ListCorpusEntityTypes", id=self.corpus_id
)
}
count = len(self.entity_types)
logger.info(
f"Loaded {count} entity {pluralize('type', count)} in corpus ({self.corpus_id})."
)
@unsupported_cache
def create_entity_type(self, name: str) -> None:
"""
Create an entity type on the given corpus.
:param name: Name of the entity type.
"""
assert name and isinstance(name, str), (
"name shouldn't be null and should be of type str"
)
try:
entity_type = self.api_client.request(
"CreateEntityType",
body={
"name": name,
"corpus": self.corpus_id,
},
)
self.entity_types[name] = entity_type["id"]
logger.info(f"Created a new entity type with name `{name}`.")
except ErrorResponse as e:
# Only reload for 400 errors
if e.status_code != 400:
raise
# Reload and make sure we have the entity type now
logger.warning(
f"Unable to create the entity type `{name}`. Refreshing corpus entity types cache."
)
self.list_corpus_entity_types()
assert name in self.entity_types, (
f"Missing entity type `{name}` even after refreshing."
)
def check_required_entity_types(
self, entity_types: list[str], create_missing: bool = True
):
"""Checks that every entity type needed is available in the corpus.
) -> None:
"""
Check that every entity type needed is available in the corpus.
Missing ones may be created automatically if needed.
:param entity_types: Entity type names to search.
:param create_missing: Whether the missing types should be created. Defaults to True.
:raises MissingEntityType: When an entity type is missing and cannot create.
:raises MissingEntityType: When an entity type is missing and cannot be created.
"""
# Retrieve entity_type ID
assert entity_types and isinstance(entity_types, list), (
"entity_types shouldn't be null and should be of type list"
)
for index, entity_type in enumerate(entity_types):
assert isinstance(entity_type, str), (
f"Entity type at index {index} in entity_types: Should be of type str"
)
assert create_missing is not None and isinstance(create_missing, bool), (
"create_missing shouldn't be null and should be of type bool"
)
if not self.entity_types:
# Load entity_types of corpus
self.list_corpus_entity_types()
for entity_type in entity_types:
# Do nothing if type already exists
# Do nothing if the type already exists
if entity_type in self.entity_types:
continue
......@@ -61,15 +123,8 @@ class EntityMixin:
f"Entity type `{entity_type}` was not in the corpus."
)
# Create type if non-existent
self.entity_types[entity_type] = self.api_client.request(
"CreateEntityType",
body={
"name": entity_type,
"corpus": self.corpus_id,
},
)["id"]
logger.info(f"Created a new entity type with name `{entity_type}`.")
# Create the type if non-existent
self.create_entity_type(entity_type)
def create_entity(
self,
......@@ -211,6 +266,7 @@ class EntityMixin:
logger.warning(
f"Couldn't save created transcription entity in local cache: {e}"
)
return transcription_ent
@unsupported_cache
......@@ -387,18 +443,3 @@ class EntityMixin:
logger.info(
f"Loaded {count} {pluralize('entity', count)} in corpus ({self.corpus_id})"
)
def list_corpus_entity_types(self):
"""
Loads available entity types in corpus.
"""
self.entity_types = {
entity_type["name"]: entity_type["id"]
for entity_type in self.api_client.paginate(
"ListCorpusEntityTypes", id=self.corpus_id
)
}
count = len(self.entity_types)
logger.info(
f"Loaded {count} entity {pluralize('type', count)} in corpus ({self.corpus_id})."
)
......@@ -10,7 +10,7 @@ from arkindex_worker.cache import (
CachedImage,
)
from arkindex_worker.models import Element
from arkindex_worker.worker.element import MissingTypeError
from arkindex_worker.worker.element import MissingElementType
from tests import CORPUS_ID
from . import BASE_API_CALLS
......@@ -34,73 +34,247 @@ def test_list_corpus_types(responses, mock_elements_worker):
}
def test_check_required_types_argument_types(mock_elements_worker):
def test_create_element_type_wrong_slug(mock_elements_worker):
with pytest.raises(
AssertionError, match="At least one element type slug is required."
AssertionError, match="slug shouldn't be null and should be of type str"
):
mock_elements_worker.check_required_types()
mock_elements_worker.create_element_type(slug=None, name="page")
with pytest.raises(AssertionError, match="Element type slugs must be strings."):
mock_elements_worker.check_required_types("lol", 42)
with pytest.raises(
AssertionError, match="slug shouldn't be null and should be of type str"
):
mock_elements_worker.create_element_type(slug=1234, name="page")
def test_check_required_types(mock_elements_worker):
mock_elements_worker.corpus_types = {
"folder": {"slug": "folder"},
"page": {"slug": "page"},
}
def test_create_element_type_wrong_name(mock_elements_worker):
with pytest.raises(
AssertionError, match="name shouldn't be null and should be of type str"
):
mock_elements_worker.create_element_type(slug="page", name=None)
with pytest.raises(
AssertionError, match="name shouldn't be null and should be of type str"
):
mock_elements_worker.create_element_type(slug="page", name=1234)
assert mock_elements_worker.check_required_types("page")
assert mock_elements_worker.check_required_types("page", "folder")
def test_create_element_type_wrong_is_folder(mock_elements_worker):
with pytest.raises(
MissingTypeError,
match=re.escape(
"Element types act, text_line were not found in corpus (11111111-1111-1111-1111-111111111111)."
),
AssertionError, match="is_folder shouldn't be null and should be of type bool"
):
mock_elements_worker.create_element_type(
slug="page", name="page", is_folder=None
)
with pytest.raises(
AssertionError, match="is_folder shouldn't be null and should be of type bool"
):
assert mock_elements_worker.check_required_types("page", "text_line", "act")
mock_elements_worker.create_element_type(
slug="page", name="page", is_folder=1234
)
def test_create_element_type_api_error(responses, mock_elements_worker):
responses.add(
responses.POST,
"http://testserver/api/v1/elements/type/",
status=418,
)
with pytest.raises(ErrorResponse):
mock_elements_worker.create_element_type(slug="page", name="page")
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/elements/type/")]
def test_create_element_type_already_exists(responses, mock_elements_worker):
assert mock_elements_worker.corpus_types == {}
def test_check_required_types_create_missing(responses, mock_elements_worker):
mock_elements_worker.corpus_types = {
responses.add(
responses.POST,
"http://testserver/api/v1/elements/type/",
status=400,
match=[
matchers.json_params_matcher(
{
"slug": "page",
"display_name": "page",
"folder": False,
"corpus": CORPUS_ID,
}
)
],
)
responses.add(
responses.GET,
f"http://testserver/api/v1/corpus/{CORPUS_ID}/",
status=200,
json={
"id": CORPUS_ID,
"types": [{"slug": "folder"}, {"slug": "page"}],
},
)
mock_elements_worker.create_element_type(slug="page", name="page")
assert len(responses.calls) == len(BASE_API_CALLS) + 2
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
("POST", "http://testserver/api/v1/elements/type/"),
("GET", f"http://testserver/api/v1/corpus/{CORPUS_ID}/"),
]
# Make sure the corpus_types attribute has been updated
assert mock_elements_worker.corpus_types == {
"folder": {"slug": "folder"},
"page": {"slug": "page"},
}
def test_create_element_type(responses, mock_elements_worker):
assert mock_elements_worker.corpus_types == {}
responses.add(
responses.POST,
"http://testserver/api/v1/elements/type/",
status=200,
match=[
matchers.json_params_matcher(
{
"slug": "text_line",
"display_name": "text_line",
"slug": "page",
"display_name": "page",
"folder": False,
"corpus": CORPUS_ID,
}
)
],
json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
)
mock_elements_worker.create_element_type(slug="page", name="page")
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
("POST", "http://testserver/api/v1/elements/type/"),
]
# Make sure the corpus_types attribute has been updated
assert mock_elements_worker.corpus_types == {
"page": {
"id": "page-id",
"slug": "page",
"display_name": "page",
"folder": False,
}
}
def test_check_required_types_wrong_type_slugs(mock_elements_worker):
with pytest.raises(
AssertionError, match="type_slugs shouldn't be null and should be of type list"
):
mock_elements_worker.check_required_types(type_slugs=None)
with pytest.raises(
AssertionError, match="type_slugs shouldn't be null and should be of type list"
):
mock_elements_worker.check_required_types(type_slugs=1234)
with pytest.raises(
AssertionError,
match="Element type at index 1 in type_slugs: Should be of type str",
):
mock_elements_worker.check_required_types(type_slugs=["page", 1234])
def test_check_required_types_wrong_create_missing(mock_elements_worker):
with pytest.raises(
AssertionError,
match="create_missing shouldn't be null and should be of type bool",
):
mock_elements_worker.check_required_types(
type_slugs=["page"], create_missing=None
)
with pytest.raises(
AssertionError,
match="create_missing shouldn't be null and should be of type bool",
):
mock_elements_worker.check_required_types(
type_slugs=["page"], create_missing=1234
)
def test_check_required_types_do_not_create_missing(responses, mock_elements_worker):
# Set one element type
mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}
with pytest.raises(
MissingElementType, match="Element type `page` was not in the corpus."
):
mock_elements_worker.check_required_types(
type_slugs=["folder", "page"], create_missing=False
)
assert len(responses.calls) == len(BASE_API_CALLS)
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS
def test_check_required_types(responses, mock_elements_worker):
# Set one element type
mock_elements_worker.corpus_types = {"folder": {"slug": "folder"}}
# Call to create a new element type
responses.add(
responses.POST,
"http://testserver/api/v1/elements/type/",
status=200,
match=[
matchers.json_params_matcher(
{
"slug": "act",
"display_name": "act",
"slug": "page",
"display_name": "page",
"folder": False,
"corpus": CORPUS_ID,
}
)
],
json={"id": "page-id", "slug": "page", "display_name": "page", "folder": False},
)
assert mock_elements_worker.check_required_types(
"page", "text_line", "act", create_missing=True
mock_elements_worker.check_required_types(
type_slugs=["folder", "page"], create_missing=True
)
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
(
"POST",
"http://testserver/api/v1/elements/type/",
),
]
# Make sure the corpus_types attribute has been updated
assert mock_elements_worker.corpus_types == {
"folder": {"slug": "folder"},
"page": {
"id": "page-id",
"slug": "page",
"display_name": "page",
"folder": False,
},
}
@pytest.mark.parametrize(
("payload", "error"),
......
import pytest
from responses import matchers
from arkindex.exceptions import ErrorResponse
from arkindex_worker.models import Transcription
from arkindex_worker.worker.entity import MissingEntityType
from tests import CORPUS_ID
......@@ -8,67 +9,158 @@ from tests import CORPUS_ID
from . import BASE_API_CALLS
def test_check_required_entity_types(responses, mock_elements_worker):
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
def test_create_entity_type_wrong_name(mock_elements_worker):
with pytest.raises(
AssertionError, match="name shouldn't be null and should be of type str"
):
mock_elements_worker.create_entity_type(name=None)
checked_types = ["person", "new-entity"]
with pytest.raises(
AssertionError, match="name shouldn't be null and should be of type str"
):
mock_elements_worker.create_entity_type(name=1234)
# Call to create new entity type
def test_create_entity_type_api_error(responses, mock_elements_worker):
responses.add(
responses.POST,
"http://testserver/api/v1/entity/types/",
status=200,
status=418,
)
with pytest.raises(ErrorResponse):
mock_elements_worker.create_entity_type(name="firstname")
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [("POST", "http://testserver/api/v1/entity/types/")]
def test_create_entity_type_already_exists(responses, mock_elements_worker):
assert mock_elements_worker.entity_types == {}
responses.add(
responses.POST,
"http://testserver/api/v1/entity/types/",
status=400,
match=[
matchers.json_params_matcher(
{
"name": "new-entity",
"corpus": CORPUS_ID,
}
)
matchers.json_params_matcher({"name": "firstname", "corpus": CORPUS_ID})
],
)
responses.add(
responses.GET,
f"http://testserver/api/v1/corpus/{CORPUS_ID}/entity-types/",
status=200,
json={
"id": "new-entity-id",
"corpus": CORPUS_ID,
"name": "new-entity",
"color": "ffd1b3",
"count": 1,
"next": None,
"results": [
{"id": "lastname-id", "name": "lastname", "color": "ffd1b3"},
{"id": "firstname-id", "name": "firstname", "color": "ffd1b3"},
],
},
)
mock_elements_worker.check_required_entity_types(
entity_types=checked_types,
)
mock_elements_worker.create_entity_type(name="firstname")
# Make sure the entity_types entry has been updated
assert len(responses.calls) == len(BASE_API_CALLS) + 2
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
("POST", "http://testserver/api/v1/entity/types/"),
("GET", f"http://testserver/api/v1/corpus/{CORPUS_ID}/entity-types/"),
]
# Make sure the entity_types attribute has been updated
assert mock_elements_worker.entity_types == {
"person": "person-entity-type-id",
"new-entity": "new-entity-id",
"lastname": "lastname-id",
"firstname": "firstname-id",
}
def test_create_entity_type(responses, mock_elements_worker):
assert mock_elements_worker.entity_types == {}
responses.add(
responses.POST,
"http://testserver/api/v1/entity/types/",
status=200,
match=[
matchers.json_params_matcher({"name": "firstname", "corpus": CORPUS_ID})
],
json={
"id": "firstname-id",
"name": "firstname",
"corpus": CORPUS_ID,
"color": "ffd1b3",
},
)
mock_elements_worker.create_entity_type(name="firstname")
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
(
"POST",
"http://testserver/api/v1/entity/types/",
),
("POST", "http://testserver/api/v1/entity/types/"),
]
# Make sure the entity_types attribute has been updated
assert mock_elements_worker.entity_types == {"firstname": "firstname-id"}
def test_check_required_entity_types_no_creation_allowed(
def test_check_required_entity_types_wrong_entity_types(mock_elements_worker):
with pytest.raises(
AssertionError,
match="entity_types shouldn't be null and should be of type list",
):
mock_elements_worker.check_required_entity_types(entity_types=None)
with pytest.raises(
AssertionError,
match="entity_types shouldn't be null and should be of type list",
):
mock_elements_worker.check_required_entity_types(entity_types=1234)
with pytest.raises(
AssertionError,
match="Entity type at index 1 in entity_types: Should be of type str",
):
mock_elements_worker.check_required_entity_types(
entity_types=["firstname", 1234]
)
def test_check_required_entity_types_wrong_create_missing(mock_elements_worker):
with pytest.raises(
AssertionError,
match="create_missing shouldn't be null and should be of type bool",
):
mock_elements_worker.check_required_entity_types(
entity_types=["firstname"], create_missing=None
)
with pytest.raises(
AssertionError,
match="create_missing shouldn't be null and should be of type bool",
):
mock_elements_worker.check_required_entity_types(
entity_types=["firstname"], create_missing=1234
)
def test_check_required_entity_types_do_not_create_missing(
responses, mock_elements_worker
):
# Set one entity type
mock_elements_worker.entity_types = {"person": "person-entity-type-id"}
checked_types = ["person", "new-entity"]
mock_elements_worker.entity_types = {"lastname": "lastname-id"}
with pytest.raises(
MissingEntityType, match="Entity type `new-entity` was not in the corpus."
MissingEntityType, match="Entity type `firstname` was not in the corpus."
):
mock_elements_worker.check_required_entity_types(
entity_types=checked_types, create_missing=False
entity_types=["lastname", "firstname"], create_missing=False
)
assert len(responses.calls) == len(BASE_API_CALLS)
......@@ -77,6 +169,47 @@ def test_check_required_entity_types_no_creation_allowed(
] == BASE_API_CALLS
def test_check_required_entity_types(responses, mock_elements_worker):
# Set one entity type
mock_elements_worker.entity_types = {"lastname": "lastname-id"}
# Call to create a new entity type
responses.add(
responses.POST,
"http://testserver/api/v1/entity/types/",
status=200,
match=[
matchers.json_params_matcher({"name": "firstname", "corpus": CORPUS_ID})
],
json={
"id": "firstname-id",
"name": "firstname",
"corpus": CORPUS_ID,
"color": "ffd1b3",
},
)
mock_elements_worker.check_required_entity_types(
entity_types=["lastname", "firstname"], create_missing=True
)
assert len(responses.calls) == len(BASE_API_CALLS) + 1
assert [
(call.request.method, call.request.url) for call in responses.calls
] == BASE_API_CALLS + [
(
"POST",
"http://testserver/api/v1/entity/types/",
),
]
# Make sure the entity_types attribute has been updated
assert mock_elements_worker.entity_types == {
"lastname": "lastname-id",
"firstname": "firstname-id",
}
def test_list_transcription_entities_deprecation(fake_dummy_worker):
transcription = Transcription({"id": "fake_transcription_id"})
worker_version = "worker_version_id"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment