Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • workers/base-worker
1 result
Show changes
Commits on Source (20)
Showing
with 302 additions and 406 deletions
......@@ -4,7 +4,7 @@ stages:
- release
lint:
image: python:3.10
image: python:3
cache:
paths:
......@@ -55,7 +55,8 @@ test:
- tox -- --junitxml=test-report.xml --durations=50
test-cookiecutter:
image: python:3
# Needed till next release
image: python:3.11
stage: test
cache:
......@@ -91,7 +92,7 @@ test-cookiecutter:
- worker-demo/
build-cookiecutter:
image: docker:19.03.1
image: docker:24.0.6
stage: build
services:
- docker:dind
......
0.3.3
0.3.4
......@@ -102,14 +102,26 @@ class CachedElement(Model):
database = db
table_name = "elements"
def open_image(self, *args, max_size: Optional[int] = None, **kwargs) -> Image:
def open_image(
self,
*args,
max_width: Optional[int] = None,
max_height: Optional[int] = None,
**kwargs,
) -> Image:
"""
Open this element's image as a Pillow image.
This does not crop the image to the element's polygon.
IIIF servers with maxWidth, maxHeight or maxArea restrictions on image size are not supported.
Warns:
----
If both, ``max_width`` and ``max_height`` are set, the image ratio is not preserved.
:param *args: Positional arguments passed to [arkindex_worker.image.open_image][]
:param max_size: Subresolution of the image.
:param max_width: The maximum width of the image.
:param max_height: The maximum height of the image.
:param **kwargs: Keyword arguments passed to [arkindex_worker.image.open_image][]
:raises ValueError: When this element does not have an image ID or a polygon.
:return: A Pillow image.
......@@ -129,7 +141,7 @@ class CachedElement(Model):
else:
box = "full"
if max_size is None:
if max_width is None and max_height is None:
resize = "full"
else:
# Do not resize for polygons that do not exactly match the images
......@@ -141,14 +153,12 @@ class CachedElement(Model):
resize = "full"
# Do not resize when the image is below the maximum size
elif self.image.width <= max_size and self.image.height <= max_size:
elif (max_width is None or self.image.width <= max_width) and (
max_height is None or self.image.height <= max_height
):
resize = "full"
else:
ratio = max_size / max(self.image.width, self.image.height)
new_width, new_height = int(self.image.width * ratio), int(
self.image.height * ratio
)
resize = f"{new_width},{new_height}"
resize = f"{max_width or ''},{max_height or ''}"
url = self.image.url
if not url.endswith("/"):
......
......@@ -10,8 +10,6 @@ from typing import Generator, List, Optional
from PIL import Image
from requests import HTTPError
from arkindex_worker import logger
class MagicDict(dict):
"""
......@@ -124,9 +122,10 @@ class Element(MagicDict):
def open_image(
self,
*args,
max_size: Optional[int] = None,
max_width: Optional[int] = None,
max_height: Optional[int] = None,
use_full_image: Optional[bool] = False,
**kwargs
**kwargs,
) -> Image:
"""
Open this element's image using Pillow, rotating and mirroring it according
......@@ -149,7 +148,13 @@ class Element(MagicDict):
``rotation_angle=0, mirrored=False`` as keyword arguments.
:param max_size: The maximum size of the requested image.
Warns:
----
If both, ``max_width`` and ``max_height`` are set, the image ratio is not preserved.
:param max_width: The maximum width of the image.
:param max_height: The maximum height of the image.
:param use_full_image: Ignore the ``zone.polygon`` and always
retrieve the image without cropping.
:param *args: Positional arguments passed to [arkindex_worker.image.open_image][].
......@@ -165,41 +170,29 @@ class Element(MagicDict):
from arkindex_worker.image import (
download_tiles,
open_image,
polygon_bounding_box,
)
if not self.get("zone"):
raise ValueError("Element {} has no zone".format(self.id))
if self.requires_tiles:
if max_size is None:
if max_width is None and max_height is None:
return download_tiles(self.zone.image.url)
else:
raise NotImplementedError
if max_size is not None:
bounding_box = polygon_bounding_box(self.zone.polygon)
if max_width is None and max_height is None:
resize = "full"
else:
original_size = {"w": self.zone.image.width, "h": self.zone.image.height}
# No resizing if the element is smaller than the image.
if (
bounding_box.width != original_size["w"]
or bounding_box.height != original_size["h"]
):
resize = "full"
logger.warning(
"Only full image size elements covered, "
+ "downloading full size image."
)
# No resizing if the image is smaller than the wanted size.
elif original_size["w"] <= max_size and original_size["h"] <= max_size:
if (max_width is None or original_size["w"] <= max_width) and (
max_height is None or original_size["h"] <= max_height
):
resize = "full"
# Resizing if the image is bigger than the wanted size.
else:
ratio = max_size / max(original_size.values())
new_width, new_height = [int(x * ratio) for x in original_size.values()]
resize = "{},{}".format(new_width, new_height)
else:
resize = "full"
resize = f"{max_width or ''},{max_height or ''}"
if use_full_image:
url = self.image_url(resize)
......@@ -212,7 +205,7 @@ class Element(MagicDict):
*args,
rotation_angle=self.rotation_angle,
mirrored=self.mirrored,
**kwargs
**kwargs,
)
except HTTPError as e:
if (
......
# -*- coding: utf-8 -*-
"""
Generator for the ``ml_report.json`` file, to report created worker results and exceptions.
"""
import json
import traceback
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Union
from uuid import UUID
from apistar.exceptions import ErrorResponse
from arkindex_worker import logger
from arkindex_worker.models import Transcription
class Reporter(object):
"""
Helper to generate an ``ml_report.json`` artifact.
"""
def __init__(
self,
name: Optional[str] = "Unknown worker",
slug: Optional[str] = "unknown-slug",
version: Optional[str] = None,
**kwargs,
):
self.report_data = {
"name": name,
"slug": slug,
"version": version,
"started": datetime.utcnow().isoformat(),
"elements": {},
}
logger.info(f"Starting ML report for {name}")
def __repr__(self):
return "{}({})".format(self.__class__.__name__, self.report_data["slug"])
def _get_element(self, element_id):
return self.report_data["elements"].setdefault(
str(element_id),
{
"started": datetime.utcnow().isoformat(),
# Created element counts, by type slug
"elements": {},
# Created transcriptions count
"transcriptions": 0,
# Created classification counts, by class
"classifications": {},
# Created entities ({"id": "", "type": "", "name": ""}) from this element
"entities": [],
# Created transcription entities ({"transcription_id": "", "entity_id": ""}) from this element
"transcription_entities": [],
# Created metadata ({"id": "", "type": "", "name": ""}) from this element
"metadata": [],
"errors": [],
},
)
def process(self, element_id: Union[str, UUID]):
"""
Report that a specific element ID is being processed.
:param element_id: ID of the element being processed.
"""
# Just call the element initializer
self._get_element(element_id)
def add_element(self, parent_id: Union[str, UUID], type: str, type_count: int = 1):
"""
Report creating an element as a child of another.
:param parent_id: ID of the parent element.
:param type: Slug of the type of the child element.
:param type_count: How many elements of this type were created.
"""
elements = self._get_element(parent_id)["elements"]
elements.setdefault(type, 0)
elements[type] += type_count
def add_classification(self, element_id: Union[str, UUID], class_name: str):
"""
Report creating a classification on an element.
:param element_id: ID of the element.
:param class_name: Name of the ML class of the new classification.
"""
classifications = self._get_element(element_id)["classifications"]
classifications.setdefault(class_name, 0)
classifications[class_name] += 1
def add_classifications(
self, element_id: Union[str, UUID], classifications: List[Dict[str, str]]
):
"""
Report creating one or more classifications at once on an element.
:param element_id: ID of the element.
:param classifications: List of classifications.
Each classification is represented as a ``dict`` with a ``class_name`` key
holding the name of the ML class being used.
"""
assert isinstance(
classifications, list
), "A list is required for classifications"
element = self._get_element(element_id)
# Retrieve the previous existing classification counts, if any
counter = Counter(**element["classifications"])
# Add the new ones
counter.update(
[classification["class_name"] for classification in classifications]
)
element["classifications"] = dict(counter)
def add_transcription(self, element_id: Union[str, UUID], count=1):
"""
Report creating a transcription on an element.
:param element_id: ID of the element.
:param count: Number of transcriptions created at once
"""
self._get_element(element_id)["transcriptions"] += count
def add_entity(
self,
element_id: Union[str, UUID],
entity_id: Union[str, UUID],
type: str,
name: str,
):
"""
Report creating an entity on an element.
:param element_id: ID of the element.
:param entity_id: ID of the new entity.
:param type: Type of the entity.
:param name: Name of the entity.
"""
entities = self._get_element(element_id)["entities"]
entities.append({"id": entity_id, "type": type, "name": name})
def add_transcription_entity(
self,
entity_id: Union[str, UUID],
transcription: Transcription,
transcription_entity_id: Union[str, UUID],
):
"""
Report creating a transcription entity on an element.
:param entity_id: ID of the entity element.
:param transcription: Transcription to add the entity on
:param transcription_entity_id: ID of the transcription entity that is created.
"""
transcription_entities = self._get_element(transcription.element.id)[
"transcription_entities"
]
transcription_entities.append(
{
"transcription_id": transcription.id,
"entity_id": entity_id,
"transcription_entity_id": transcription_entity_id,
}
)
def add_entity_link(self, *args, **kwargs):
"""
Report creating an entity link. Not currently supported.
:raises NotImplementedError:
"""
raise NotImplementedError
def add_entity_role(self, *args, **kwargs):
"""
Report creating an entity role. Not currently supported.
:raises NotImplementedError:
"""
raise NotImplementedError
def add_metadata(
self,
element_id: Union[str, UUID],
metadata_id: Union[str, UUID],
type: str,
name: str,
):
"""
Report creating a metadata from an element.
:param element_id: ID of the element.
:param metadata_id: ID of the new metadata.
:param type: Type of the metadata.
:param name: Name of the metadata.
"""
metadata = self._get_element(element_id)["metadata"]
metadata.append({"id": metadata_id, "type": type, "name": name})
def error(self, element_id: Union[str, UUID], exception: Exception):
"""
Report that a Python exception occurred when processing an element.
:param element_id: ID of the element.
:param exception: A Python exception.
"""
error_data = {
"class": exception.__class__.__name__,
"message": str(exception),
}
if exception.__traceback__ is not None:
error_data["traceback"] = "\n".join(
traceback.format_tb(exception.__traceback__)
)
if isinstance(exception, ErrorResponse):
error_data["message"] = exception.title
error_data["status_code"] = exception.status_code
error_data["content"] = exception.content
self._get_element(element_id)["errors"].append(error_data)
def save(self, path: Union[str, Path]):
"""
Save the ML report to the specified path.
:param path: Path to save the ML report to.
"""
logger.info(f"Saving ML report to {path}")
with open(path, "w") as f:
json.dump(self.report_data, f)
......@@ -15,7 +15,6 @@ from apistar.exceptions import ErrorResponse
from arkindex_worker import logger
from arkindex_worker.cache import CachedElement
from arkindex_worker.models import Element
from arkindex_worker.reporting import Reporter
from arkindex_worker.worker.base import BaseWorker
from arkindex_worker.worker.classification import ClassificationMixin
from arkindex_worker.worker.element import ElementMixin
......@@ -156,17 +155,12 @@ class ElementsWorker(
super().configure()
super().configure_cache()
# Add report concerning elements
self.report = Reporter(
**self.worker_details, version=getattr(self, "worker_version_id", None)
)
def run(self):
"""
Implements an Arkindex worker that goes through each element returned by
[list_elements][arkindex_worker.worker.ElementsWorker.list_elements]. It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element], catching exceptions
and reporting them using the [Reporter][arkindex_worker.reporting.Reporter], and handles saving the report
once the process is complete as well as WorkerActivity updates when enabled.
[list_elements][arkindex_worker.worker.ElementsWorker.list_elements].
It calls [process_element][arkindex_worker.worker.ElementsWorker.process_element],
catching exceptions, and handles saving WorkerActivity updates when enabled.
"""
self.configure()
......@@ -232,10 +226,6 @@ class ElementsWorker(
self.update_activity(element.id, ActivityState.Error)
except Exception:
pass
self.report.error(element_id, e)
# Save report as local artifact
self.report.save(self.work_dir / "ml_report.json")
if failed:
logger.error(
......
......@@ -140,7 +140,7 @@ class BaseWorker(object):
self.process_information = None
# corpus_id will be updated in configure() using the worker_run's corpus
# or in configure_for_developers() from the environment
self.corpus_id = None
self._corpus_id = None
self.user_configuration = {}
self.model_configuration = {}
self.support_cache = support_cache
......@@ -155,6 +155,17 @@ class BaseWorker(object):
# Define API Client
self.setup_api_client()
@property
def corpus_id(self) -> str:
"""
ID of the corpus on which the worker is executed.
Has to be set through the `ARKINDEX_CORPUS_ID` variable in **read-only** mode.
Raises an Exception when trying to access when unset.
"""
if not self._corpus_id:
raise Exception("Missing ARKINDEX_CORPUS_ID environment variable")
return self._corpus_id
@property
def is_read_only(self) -> bool:
"""
......@@ -199,11 +210,7 @@ class BaseWorker(object):
logger.warning("Running without any extra configuration")
# Define corpus_id from environment
self.corpus_id = os.environ.get("ARKINDEX_CORPUS_ID")
if not self.corpus_id:
logger.warning(
"'ARKINDEX_CORPUS_ID' was not set in the environment. Any API request involving a `corpus_id` will fail."
)
self._corpus_id = os.environ.get("ARKINDEX_CORPUS_ID")
# Define model_version_id from environment
self.model_version_id = os.environ.get("ARKINDEX_MODEL_VERSION_ID")
......@@ -229,7 +236,7 @@ class BaseWorker(object):
self.process_information = worker_run["process"]
# Load corpus id
self.corpus_id = worker_run["process"]["corpus"]
self._corpus_id = worker_run["process"]["corpus"]
# Load worker version information
worker_version = worker_run["worker_version"]
......
......@@ -176,8 +176,6 @@ class ClassificationMixin(object):
# Propagate any other API error
raise
self.report.add_classification(element.id, ml_class)
return created
def create_classifications(
......@@ -248,7 +246,6 @@ class ClassificationMixin(object):
for created_cl in created_cls:
created_cl["class_name"] = self.retrieve_ml_class(created_cl["ml_class"])
self.report.add_classification(element.id, created_cl["class_name"])
if self.use_cache:
# Store classifications in local cache
......
......@@ -3,6 +3,7 @@
ElementsWorker methods for elements and element types.
"""
from typing import Dict, Iterable, List, NamedTuple, Optional, Union
from uuid import UUID
from peewee import IntegrityError
......@@ -141,7 +142,6 @@ class ElementMixin(object):
"confidence": confidence,
},
)
self.report.add_element(element.id, type)
return sub_element["id"] if slim_output else sub_element
......@@ -237,9 +237,6 @@ class ElementMixin(object):
},
)
for element in elements:
self.report.add_element(parent.id, element["type"])
if self.use_cache:
# Create the image as needed and handle both an Element and a CachedElement
if isinstance(parent, CachedElement):
......@@ -275,33 +272,39 @@ class ElementMixin(object):
return created_ids
def update_element(
self,
element: Union[Element, CachedElement],
type: Optional[str] = None,
name: Optional[str] = None,
polygon: Optional[List[List[Union[int, float]]]] = None,
confidence: Optional[float] = None,
def partial_update_element(
self, element: Union[Element, CachedElement], **kwargs
) -> dict:
"""
Partially update an element through the API.
Partially updates an element through the API.
:param element: The element to update.
:param type: Optional new slug type of the element.
:param name: Optional new name of the element.
:param polygon: Optional new polygon of the element.
:param confidence: Optional new confidence score, between 0.0 and 1.0.
:param **kwargs:
* *type* (``str``): Optional slug type of the element.
* *name* (``str``): Optional name of the element.
* *polygon* (``list``): Optional polygon for this element
* *confidence* (``float``): Optional confidence score of this element
* *rotation_angle* (``int``): Optional rotation angle of this element
* *mirrored* (``bool``): Optional mirror status of this element
* *image* (``UUID``): Optional ID of the image of this element
:returns: A dict from the ``PartialUpdateElement`` API endpoint,
"""
assert element and isinstance(
element, (Element, CachedElement)
), "element shouldn't be null and should be an Element or CachedElement"
assert type is None or isinstance(type, str), "type should be None or a str"
assert name is None or isinstance(name, str), "name should be None or a str"
assert polygon is None or isinstance(
polygon, list
), "polygon should be None or a list"
if polygon:
if "type" in kwargs:
assert isinstance(kwargs["type"], str), "type should be a str"
if "name" in kwargs:
assert isinstance(kwargs["name"], str), "name should be a str"
if "polygon" in kwargs:
polygon = kwargs["polygon"]
assert isinstance(polygon, list), "polygon should be a list"
assert len(polygon) >= 3, "polygon should have at least three points"
assert all(
isinstance(point, list) and len(point) == 2 for point in polygon
......@@ -309,9 +312,27 @@ class ElementMixin(object):
assert all(
isinstance(coord, (int, float)) for point in polygon for coord in point
), "polygon points should be lists of two numbers"
assert confidence is None or (
isinstance(confidence, float) and 0 <= confidence <= 1
), "confidence should be None or a float in [0..1] range"
if "confidence" in kwargs:
confidence = kwargs["confidence"]
assert confidence is None or (
isinstance(confidence, float) and 0 <= confidence <= 1
), "confidence should be None or a float in [0..1] range"
if "rotation_angle" in kwargs:
rotation_angle = kwargs["rotation_angle"]
assert (
isinstance(rotation_angle, int) and rotation_angle >= 0
), "rotation_angle should be a positive integer"
if "mirrored" in kwargs:
assert isinstance(kwargs["mirrored"], bool), "mirrored should be a boolean"
if "image" in kwargs:
image = kwargs["image"]
assert isinstance(image, UUID), "image should be a UUID"
# Cast to string
kwargs["image"] = str(image)
if self.is_read_only:
logger.warning("Cannot update element as this worker is in read-only mode")
......@@ -320,22 +341,24 @@ class ElementMixin(object):
updated_element = self.request(
"PartialUpdateElement",
id=element.id,
body={
"type": type,
"name": name,
"polygon": polygon,
"confidence": confidence,
},
body=kwargs,
)
if self.use_cache:
CachedElement.update(
{
CachedElement.type: type,
CachedElement.polygon: str(polygon),
CachedElement.confidence: confidence,
}
).where(CachedElement.id == element.id).execute()
# Name is not present in CachedElement model
kwargs.pop("name", None)
# Stringify polygon if present
if "polygon" in kwargs:
kwargs["polygon"] = str(kwargs["polygon"])
# Retrieve the right image
if "image" in kwargs:
kwargs["image"] = CachedImage.get_by_id(kwargs["image"])
CachedElement.update(**kwargs).where(
CachedElement.id == element.id
).execute()
return updated_element
......@@ -497,3 +520,165 @@ class ElementMixin(object):
)
return children
def list_element_parents(
self,
element: Union[Element, CachedElement],
folder: Optional[bool] = None,
name: Optional[str] = None,
recursive: Optional[bool] = None,
transcription_worker_version: Optional[Union[str, bool]] = None,
transcription_worker_run: Optional[Union[str, bool]] = None,
type: Optional[str] = None,
with_classes: Optional[bool] = None,
with_corpus: Optional[bool] = None,
with_metadata: Optional[bool] = None,
with_has_children: Optional[bool] = None,
with_zone: Optional[bool] = None,
worker_version: Optional[Union[str, bool]] = None,
worker_run: Optional[Union[str, bool]] = None,
) -> Union[Iterable[dict], Iterable[CachedElement]]:
"""
List parents of an element.
:param element: Child element to find parents of.
:param folder: Restrict to or exclude elements with folder types.
This parameter is not supported when caching is enabled.
:param name: Restrict to elements whose name contain a substring (case-insensitive).
This parameter is not supported when caching is enabled.
:param recursive: Look for elements recursively (grand-children, etc.)
This parameter is not supported when caching is enabled.
:param transcription_worker_version: Restrict to elements that have a transcription created by a worker version with this UUID.
This parameter is not supported when caching is enabled.
:param transcription_worker_run: Restrict to elements that have a transcription created by a worker run with this UUID.
This parameter is not supported when caching is enabled.
:param type: Restrict to elements with a specific type slug
This parameter is not supported when caching is enabled.
:param with_classes: Include each element's classifications in the response.
This parameter is not supported when caching is enabled.
:param with_corpus: Include each element's corpus in the response.
This parameter is not supported when caching is enabled.
:param with_has_children: Include the ``has_children`` attribute in the response,
indicating if this element has child elements of its own.
This parameter is not supported when caching is enabled.
:param with_metadata: Include each element's metadata in the response.
This parameter is not supported when caching is enabled.
:param with_zone: Include the ``zone`` attribute in the response,
holding the element's image and polygon.
This parameter is not supported when caching is enabled.
:param worker_version: Restrict to elements created by a worker version with this UUID.
:param worker_run: Restrict to elements created by a worker run with this UUID.
:return: An iterable of dicts from the ``ListElementParents`` API endpoint,
or an iterable of [CachedElement][arkindex_worker.cache.CachedElement] when caching is enabled.
"""
assert element and isinstance(
element, (Element, CachedElement)
), "element shouldn't be null and should be an Element or CachedElement"
query_params = {}
if folder is not None:
assert isinstance(folder, bool), "folder should be of type bool"
query_params["folder"] = folder
if name:
assert isinstance(name, str), "name should be of type str"
query_params["name"] = name
if recursive is not None:
assert isinstance(recursive, bool), "recursive should be of type bool"
query_params["recursive"] = recursive
if transcription_worker_version is not None:
assert isinstance(
transcription_worker_version, (str, bool)
), "transcription_worker_version should be of type str or bool"
if isinstance(transcription_worker_version, bool):
assert (
transcription_worker_version is False
), "if of type bool, transcription_worker_version can only be set to False"
query_params["transcription_worker_version"] = transcription_worker_version
if transcription_worker_run is not None:
assert isinstance(
transcription_worker_run, (str, bool)
), "transcription_worker_run should be of type str or bool"
if isinstance(transcription_worker_run, bool):
assert (
transcription_worker_run is False
), "if of type bool, transcription_worker_run can only be set to False"
query_params["transcription_worker_run"] = transcription_worker_run
if type:
assert isinstance(type, str), "type should be of type str"
query_params["type"] = type
if with_classes is not None:
assert isinstance(with_classes, bool), "with_classes should be of type bool"
query_params["with_classes"] = with_classes
if with_corpus is not None:
assert isinstance(with_corpus, bool), "with_corpus should be of type bool"
query_params["with_corpus"] = with_corpus
if with_has_children is not None:
assert isinstance(
with_has_children, bool
), "with_has_children should be of type bool"
query_params["with_has_children"] = with_has_children
if with_metadata is not None:
assert isinstance(
with_metadata, bool
), "with_metadata should be of type bool"
query_params["with_metadata"] = with_metadata
if with_zone is not None:
assert isinstance(with_zone, bool), "with_zone should be of type bool"
query_params["with_zone"] = with_zone
if worker_version is not None:
assert isinstance(
worker_version, (str, bool)
), "worker_version should be of type str or bool"
if isinstance(worker_version, bool):
assert (
worker_version is False
), "if of type bool, worker_version can only be set to False"
query_params["worker_version"] = worker_version
if worker_run is not None:
assert isinstance(
worker_run, (str, bool)
), "worker_run should be of type str or bool"
if isinstance(worker_run, bool):
assert (
worker_run is False
), "if of type bool, worker_run can only be set to False"
query_params["worker_run"] = worker_run
if self.use_cache:
# Checking that we only received query_params handled by the cache
assert set(query_params.keys()) <= {
"type",
"worker_version",
"worker_run",
}, "When using the local cache, you can only filter by 'type' and/or 'worker_version' and/or 'worker_run'"
parent_ids = CachedElement.select(CachedElement.parent_id).where(
CachedElement.id == element.id
)
query = CachedElement.select().where(CachedElement.id.in_(parent_ids))
if type:
query = query.where(CachedElement.type == type)
if worker_version is not None:
# If worker_version=False, filter by manual worker_version e.g. None
worker_version_id = worker_version or None
if worker_version_id:
query = query.where(
CachedElement.worker_version_id == worker_version_id
)
else:
query = query.where(CachedElement.worker_version_id.is_null())
if worker_run is not None:
# If worker_run=False, filter by manual worker_run e.g. None
worker_run_id = worker_run or None
if worker_run_id:
query = query.where(CachedElement.worker_run_id == worker_run_id)
else:
query = query.where(CachedElement.worker_run_id.is_null())
return query
else:
parents = self.api_client.paginate(
"ListElementParents", id=element.id, **query_params
)
return parents
......@@ -8,7 +8,7 @@ from typing import Dict, List, Optional, TypedDict, Union
from peewee import IntegrityError
from arkindex_worker import logger
from arkindex_worker.cache import CachedElement, CachedEntity, CachedTranscriptionEntity
from arkindex_worker.cache import CachedEntity, CachedTranscriptionEntity
from arkindex_worker.models import Element, Transcription
Entity = TypedDict(
......@@ -68,7 +68,6 @@ class EntityMixin(object):
def create_entity(
self,
element: Union[Element, CachedElement],
name: str,
type: str,
metas=dict(),
......@@ -78,14 +77,9 @@ class EntityMixin(object):
Create an entity on the given corpus.
If cache support is enabled, a [CachedEntity][arkindex_worker.cache.CachedEntity] will also be created.
:param element: An element on which the entity will be reported with the [Reporter][arkindex_worker.reporting.Reporter].
This does not have any effect on the entity itself.
:param name: Name of the entity.
:param type: Type of the entity.
"""
assert element and isinstance(
element, (Element, CachedElement)
), "element shouldn't be null and should be an Element or CachedElement"
assert name and isinstance(
name, str
), "name shouldn't be null and should be of type str"
......@@ -119,7 +113,6 @@ class EntityMixin(object):
"worker_run_id": self.worker_run_id,
},
)
self.report.add_entity(element.id, entity["id"], entity_type_id, name)
if self.use_cache:
# Store entity in local cache
......@@ -196,7 +189,6 @@ class EntityMixin(object):
id=transcription.id,
body=body,
)
self.report.add_transcription_entity(entity, transcription, transcription_ent)
if self.use_cache:
# Store transcription entity in local cache
......@@ -247,11 +239,6 @@ class EntityMixin(object):
transcription, Transcription
), "transcription shouldn't be null and should be of type Transcription"
# Needed for MLreport
assert (
hasattr(transcription, "element") and transcription.element
), f"No element linked to {transcription}"
assert entities and isinstance(
entities, list
), "entities shouldn't be null and should be of type list"
......@@ -301,22 +288,6 @@ class EntityMixin(object):
},
)
for entity, created_objects in zip(entities, created_ids["entities"]):
# Report entity creation
self.report.add_entity(
transcription.element.id,
created_objects["entity_id"],
entity.get("type_id"),
entity.get("name"),
)
# Report transcription entity creation
self.report.add_transcription_entity(
created_objects["entity_id"],
transcription,
created_objects["transcription_entity_id"],
)
return created_ids["entities"]
def list_transcription_entities(
......
......@@ -105,7 +105,6 @@ class MetaDataMixin(object):
"worker_run_id": self.worker_run_id,
},
)
self.report.add_metadata(element.id, metadata["id"], type.value, name)
return metadata["id"]
......@@ -182,9 +181,6 @@ class MetaDataMixin(object):
},
)["metadata_list"]
for meta in created_metadatas:
self.report.add_metadata(element.id, meta["id"], meta["type"], meta["name"])
return created_metadatas
def list_element_metadata(
......
......@@ -88,8 +88,6 @@ class TranscriptionMixin(object):
},
)
self.report.add_transcription(element.id)
if self.use_cache:
# Store transcription in local cache
try:
......@@ -181,9 +179,6 @@ class TranscriptionMixin(object):
},
)["transcriptions"]
for created_tr in created_trs:
self.report.add_transcription(created_tr["element_id"])
if self.use_cache:
# Store transcriptions in local cache
try:
......@@ -308,8 +303,6 @@ class TranscriptionMixin(object):
logger.debug(
f"A sub_element of {element.id} with type {sub_element_type} was created during transcriptions bulk creation"
)
self.report.add_element(element.id, sub_element_type)
self.report.add_transcription(annotation["element_id"])
if self.use_cache:
# Store transcriptions and their associated element (if created) in local cache
......
black==23.7.0
black==23.9.1
doc8==1.1.1
mkdocs==1.5.2
mkdocs==1.5.3
mkdocs-material==9.1.21
mkdocstrings==0.22.0
mkdocstrings-python==1.3.0
......
......@@ -83,20 +83,6 @@ The multiple configuration sources from the Arkindex-mode are merged into a uniq
One information cannot be retrieved directly from the configuration file and is required in some cases: the ID of the Arkindex corpus which the elements processed belong to. This is retrieved via the `ARKINDEX_CORPUS_ID` environment variable.
## Worker reporter
At the end of a worker execution, a report about the publication done by the worker is generated in JSON-format. This lists
- the starting time,
- the number of elements created, grouped by type,
- the number of transcription created,
- the number of classifications created, grouped by class,
- the number of entities created,
- the number of entities created on transcriptions,
- the number of metadatas created,
- the encountered errors' logs.
This is done by the many helper described in the [reporting module](../../ref/reporting.md). They use the `report` attribute initialized at the configuration stage.
## Setting Debug logging level
There are three ways to activate the debug mode:
......@@ -132,9 +118,6 @@ Many attributes are set on the worker during at the configuration stage. Here is
`process_information`
: The details about the process parent to this worker execution. Only set in Arkindex mode.
`reporter`
: The `Reporter` instance that will generate the `ml_report.json` artifacts which sums up the publication done during this execution and the errors encountered.
`secrets`
: A dictionary mapping the secret name to their parsed content.
......
......@@ -28,7 +28,6 @@ flowchart LR
subgraph id3[Loop over each element]
element_processing --> element_processing
end
element_processing -- Save ML report to disk --> reporting
end
init --> run
end
......
# Reporting
::: arkindex_worker.reporting
# Releases
## 0.3.4
Released on **14 Sept 2023** &bull; View on [Gitlab](https://gitlab.teklia.com/workers/base-worker/-/releases/0.3.4)
- The worker template was updated to correctly install [Git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) if it depends on any.
- Base-worker now uses [ruff](https://github.com/charliermarsh/ruff) for linting. This tool replaces `isort` and `flake8`.
- New Arkindex API helper to update an element, calling [PartialUpdateElement](https://demo.arkindex.org/api-docs/#tag/elements/operation/PartialUpdateElement).
- New Arkindex API helper to list an element's parents, calling [ListElementParents](https://demo.arkindex.org/api-docs/#tag/elements/operation/ListElementParents).
- Worker Activity API is now disabled when the worker runs in `read-only` mode instead of relying on the `--dev` CLI argument. The [update_activity](https://workers.arkindex.org/ref/elements_worker/#arkindex_worker.worker.ElementsWorker.update_activity) API helper was updated following Arkindex 1.5.1 changes.
- Worker can now resize the image of an element when opening them. This uses the [IIIF](https://iiif.io/api/image/2.1/#size) resizing API.
## 0.3.3
......
......@@ -88,7 +88,6 @@ nav:
- Models: ref/models.md
- Git & Gitlab support: ref/git.md
- Image utilities: ref/image.md
- Reporting: ref/reporting.md
- Cache: ref/cache.md
- Utils: ref/utils.md
- Releases: releases.md
......
arkindex-client==1.0.13
arkindex-client==1.0.14
peewee==3.16.3
Pillow==10.0.0
pymdown-extensions==10.2
Pillow==10.1.0
pymdown-extensions==10.3
python-gitlab==3.15.0
python-gnupg==0.5.1
sh==2.0.6
shapely==2.0.1
shapely==2.0.2
tenacity==8.2.3
zstandard==0.21.0
pytest==7.4.0
pytest==7.4.2
pytest-mock==3.11.1
pytest-responses==0.5.1