Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • workers/base-worker
1 result
Show changes
Commits on Source (2)
Showing with 666 additions and 91 deletions
......@@ -164,8 +164,57 @@ pages:
stage: release
only:
- master
- tags
docs-deploy:
image: node:16
stage: release
dependencies:
- docs-build
before_script:
- npm install -g surge
except:
- master
- tags
- schedules
environment:
name: ${CI_COMMIT_REF_SLUG}
url: https://${CI_COMMIT_REF_SLUG}-base-worker-arkindex.surge.sh
on_stop: docs-stop-surge
script:
- surge public ${CI_ENVIRONMENT_URL}
docs-stop-surge:
image: node:16
stage: release
when: manual
# Do not try to checkout the branch if it was deleted
variables:
GIT_STRATEGY: none
except:
- master
- tags
- schedules
environment:
name: ${CI_COMMIT_REF_SLUG}
url: https://${CI_COMMIT_REF_SLUG}-base-worker-arkindex.surge.sh
action: stop
before_script:
- npm install -g surge
script:
- surge teardown ${CI_ENVIRONMENT_URL}
release-notes:
stage: release
image: registry.gitlab.com/teklia/devops:latest
......
# -*- coding: utf-8 -*-
"""
Database mappings and helper methods for the experimental worker caching feature.
On methods that support caching, the database will be used for all reads,
and writes will go both to the Arkindex API and the database,
reducing network usage.
"""
import json
import os
import sqlite3
......@@ -26,6 +34,10 @@ db = SqliteDatabase(None)
class JSONField(Field):
"""
A Peewee field that stores a JSON payload as a string and parses it automatically.
"""
field_type = "text"
def db_value(self, value):
......@@ -40,6 +52,11 @@ class JSONField(Field):
class Version(Model):
"""
Cache version table, used to warn about incompatible cache databases
when a worker uses an outdated version of ``base-worker``.
"""
version = IntegerField(primary_key=True)
class Meta:
......@@ -80,7 +97,12 @@ class CachedElement(Model):
This does not crop the image to the element's polygon.
IIIF servers with maxWidth, maxHeight or maxArea restrictions on image size are not supported.
:param \\*args: Positional arguments passed to :meth:`arkindex_worker.image.open_image`
:param max_size: Subresolution of the image.
:type max_size: int or None
:param \\**kwargs: Keyword arguments passed to :meth:`arkindex_worker.image.open_image`
:raises ValueError: When this element does not have an image ID or a polygon.
:returns PIL.Image: A Pillow image.
"""
if not self.image_id or not self.polygon:
raise ValueError(f"Element {self.id} has no image")
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Helper classes for workers that interact with Git repositories and the GitLab API.
"""
import shutil
import time
from datetime import datetime
......
# -*- coding: utf-8 -*-
"""
Helper methods to download and open IIIF images, and manage polygons.
"""
import os
from collections import namedtuple
from io import BytesIO
......@@ -25,7 +29,20 @@ BoundingBox = namedtuple("BoundingBox", ["x", "y", "width", "height"])
def open_image(path, mode="RGB", rotation_angle=0, mirrored=False):
"""
Open an image from a path or a URL
Open an image from a path or a URL.
.. warning:: Prefer :meth:`Element.open_image` whenever possible.
:param str path: Path or URL to open the image from.
This parameter will be interpreted as a URL when it has an `http` or `https` scheme
and no file exists with this path locally.
:param str mode: Pillow mode for the image. See `the Pillow documentation
<https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes>`_.
:param int rotation_angle: Rotation angle to apply to the image, in degrees.
If it is not a multiple of 90°, then the rotation can cause empty pixels of
the mode's default color to be added for padding.
:param bool mirrored: Whether or not to mirror the image horizontally.
:returns PIL.Image: A Pillow image.
"""
if (
path.startswith("http://")
......@@ -53,7 +70,10 @@ def open_image(path, mode="RGB", rotation_angle=0, mirrored=False):
def download_image(url):
"""
Download an image and open it with Pillow
Download an image and open it with Pillow.
:param str url: URL of the image.
:returns PIL.Image: A Pillow image.
"""
assert url.startswith("http"), "Image URL must be HTTP(S)"
......@@ -89,6 +109,13 @@ def download_image(url):
def polygon_bounding_box(polygon):
"""
Compute the rectangle bounding box of a polygon.
:param polygon: Polygon to get the bounding box of.
:type polygon: list(list(int or float))
:returns BoundingBox: Bounding box of this polygon.
"""
x_coords, y_coords = zip(*polygon)
x, y = min(x_coords), min(y_coords)
width, height = max(x_coords) - x, max(y_coords) - y
......@@ -117,7 +144,10 @@ def _retried_request(url):
def download_tiles(url):
"""
Reconstruct a full IIIF image on servers that cannot serve the full-sized image using tiles.
Reconstruct a full IIIF image on servers that cannot serve the full-sized image, using tiles.
:param str url: URL of the image.
:returns PIL.Image: A Pillow image.
"""
if not url.endswith("/"):
url += "/"
......@@ -187,12 +217,19 @@ def download_tiles(url):
def trim_polygon(polygon, image_width: int, image_height: int):
"""
This method takes as input:
- a polygon: a list or tuple of points
- image_width, image_height: an image's dimensions
and outputs a new polygon, whose points are all located within the image.
If some of the polygon's points are not inside the image, the polygon gets trimmed,
which means that some points can disappear or their coordinates be modified.
Trim a polygon to an image's boundaries, with non-negative coordinates.
:param polygon: A polygon to trim.
:type polygon: list(list(int or float) or tuple(int or float)) or tuple(tuple(int or float) or list(int or float))
:param int image_width: Width of the image.
:param int image_height: Height of the image.
:returns: A polygon trimmed to the image's bounds.
Some points may appear as missing, as the trimming can deduplicate points.
The first and last point are always equal, to reproduce the behavior
of the Arkindex backend.
:rtype: list(list(int or float))
:raises AssertionError: When argument types are invalid or when the trimmed polygon
is entirely outside of the image's bounds.
"""
assert isinstance(
......@@ -232,15 +269,19 @@ def trim_polygon(polygon, image_width: int, image_height: int):
def revert_orientation(element, polygon):
"""Update the coordinates of the polygon of a child element based on the orientation of
"""
Update the coordinates of the polygon of a child element based on the orientation of
its parent.
:param element: Parent element
:type element: Element|CachedElement
This method should be called before sending any polygon to Arkindex, to undo the possible
orientation applied by :meth:`Element.open_image`.
:param element: Parent element.
:type element: Element or CachedElement
:param polygon: Polygon corresponding to the child element.
:type polygon: list
:return: A polygon with updated coordinates
:rtype: list
:type polygon: list(list(int or float))
:return: A polygon with updated coordinates.
:rtype: list(list(int))
"""
from arkindex_worker.models import Element
from arkindex_worker.cache import CachedElement
......
# -*- coding: utf-8 -*-
"""
Wrappers around API results to provide more convenient attribute access and IIIF helpers.
"""
import tempfile
from contextlib import contextmanager
from typing import Optional
from requests import HTTPError
......@@ -55,7 +60,7 @@ class MagicDict(dict):
class Element(MagicDict):
"""
Describes any kind of element.
Describes an Arkindex element.
"""
def resize_zone_url(self, size="full"):
......@@ -66,10 +71,13 @@ class Element(MagicDict):
parts[-3] = size
return "/".join(parts)
def image_url(self, size="full"):
def image_url(self, size="full") -> Optional[str]:
"""
When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers
:param size: Subresolution of the image.
Build a URL to access the image.
When possible, will return the S3 URL for images, so an ML worker can bypass IIIF servers.
:param str size: Subresolution of the image, following the syntax of the IIIF resize parameter.
:returns: A URL to the image, or None if the element does not have an image.
:rtype: str or None
"""
if not self.get("zone"):
return
......@@ -82,7 +90,11 @@ class Element(MagicDict):
return "{}full/{}/0/default.jpg".format(url, size)
@property
def requires_tiles(self):
def requires_tiles(self) -> bool:
"""
:return bool: Whether or not downloading and combining IIIF tiles will be necessary
to retrieve this element's image. Will be False if the element has no image.
"""
if not self.get("zone") or self.zone.image.get("s3_url"):
return False
max_width = self.zone.image.server.max_width or float("inf")
......@@ -92,15 +104,39 @@ class Element(MagicDict):
def open_image(self, *args, max_size=None, use_full_image=False, **kwargs):
"""
Open this element's image as a Pillow image.
If use_full_image is False:
Use zone url, instead of the full image url to be able to get
single page from double page image.
Else:
This does not crop the image to the element's polygon.
:param max_size: Subresolution of the image.
Open this element's image using Pillow, rotating and mirroring it according
to the ``rotation_angle`` and ``mirrored`` attributes.
When tiling is not required to download the image, and no S3 URL is available
to bypass IIIF servers, the image will be cropped to the rectangle bounding box
of the ``zone.polygon`` attribute.
.. warning::
This method implicitly applies the element's orientation to the image.
If your process uses the returned image to find more polygons and send them
back to Arkindex, use the :meth:`arkindex_worker.image.revert_orientation`
helper to undo the orientation on all polygons before sending them, as the
Arkindex API expects unoriented polygons.
Although not recommended, you can bypass this behavior by passing
``rotation_angle=0, mirrored=False`` as keyword arguments.
:param max_size: The maximum size of the requested image.
:type max_size: int or None
:param bool use_full_image: Ignore the ``zone.polygon`` and always
retrieve the image without cropping.
:param \\*args: Positional arguments passed to :meth:`arkindex_worker.image.open_image`.
:param \\**kwargs: Keyword arguments passed to :meth:`arkindex_worker.image.open_image`.
:raises ValueError: When the element does not have an image.
:raises NotImplementedError: When the ``max_size`` parameter is set,
but the IIIF server's configuration requires downloading and combining tiles
to retrieve the image.
:raises NotImplementedError: When an S3 URL has been used to download the image,
but the URL has expired. Re-fetching the URL automatically is not supported.
:return PIL.Image: A Pillow image.
"""
if not self.get("zone"):
raise ValueError("Element {} has no zone".format(self.id))
......@@ -164,7 +200,15 @@ class Element(MagicDict):
def open_image_tempfile(self, format="jpeg", *args, **kwargs):
"""
Get the element's image as a temporary file stored on the disk.
To be used as a context manager: with element.open_image_tempfile() as f: ...
To be used as a context manager::
with element.open_image_tempfile() as f:
...
:param str format: File format to use to store the image on the disk.
Must be a format supported by Pillow.
:param \\*args: Positional arguments passed to :meth:`arkindex_worker.image.open_image`.
:param \\**kwargs: Keyword arguments passed to :meth:`arkindex_worker.image.open_image`.
"""
with tempfile.NamedTemporaryFile() as f:
self.open_image(*args, **kwargs).save(f, format=format)
......
# -*- coding: utf-8 -*-
"""
Generator for the ``ml_report.json`` file, to report created worker results and exceptions.
"""
import json
import traceback
from collections import Counter
......@@ -10,6 +14,10 @@ from arkindex_worker import logger
class Reporter(object):
"""
Helper to generate an ``ml_report.json`` artifact.
"""
def __init__(
self, name="Unknown worker", slug="unknown-slug", version=None, **kwargs
):
......@@ -47,14 +55,21 @@ class Reporter(object):
def process(self, element_id):
"""
Report that a specific element ID is being processed.
:param element_id: ID of the element being processed.
:type element_id: str or uuid.UUID
"""
# Just call the element initializer
self._get_element(element_id)
def add_element(self, parent_id, type, type_count=1):
"""
Report creating a single element with a parent.
Multiple elements with the same type and parent can be declared with the type_count parameter.
Report creating an element as a child of another.
:param parent_id: ID of the parent element.
:type parent_id: str or uuid.UUID
:param str type: Slug of the type of the child element.
:param int type_count: How many elements of this type were created. Defaults to 1.
"""
elements = self._get_element(parent_id)["elements"]
elements.setdefault(type, 0)
......@@ -63,6 +78,10 @@ class Reporter(object):
def add_classification(self, element_id, class_name):
"""
Report creating a classification on an element.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param str class_name: Name of the ML class of the new classification.
"""
classifications = self._get_element(element_id)["classifications"]
classifications.setdefault(class_name, 0)
......@@ -70,7 +89,14 @@ class Reporter(object):
def add_classifications(self, element_id, classifications):
"""
Report one or more classifications at once.
Report creating one or more classifications at once on an element.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param classifications: List of classifications.
Each classification is represented as a ``dict`` with a ``class_name`` key
holding the name of the ML class being used.
:type classifications: List[Dict[str, str]]
"""
assert isinstance(
classifications, list
......@@ -87,31 +113,63 @@ class Reporter(object):
def add_transcription(self, element_id, count=1):
"""
Report creating a transcription on an element.
Multiple transcriptions with the same parent can be declared with the count parameter.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param int count: Number of transcriptions created at once, defaults to 1.
"""
self._get_element(element_id)["transcriptions"] += count
def add_entity(self, element_id, entity_id, type, name):
"""
Report creating an entity from an element.
Report creating an entity on an element.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param str entity_id: ID of the new entity.
:param str type: Type of the entity.
:param str name: Name of the entity.
"""
entities = self._get_element(element_id)["entities"]
entities.append({"id": entity_id, "type": type, "name": name})
def add_entity_link(self, *args, **kwargs):
"""
Report creating an entity link. Not currently supported.
:raises NotImplementedError:
"""
raise NotImplementedError
def add_entity_role(self, *args, **kwargs):
"""
Report creating an entity role. Not currently supported.
:raises NotImplementedError:
"""
raise NotImplementedError
def add_metadata(self, element_id, metadata_id, type, name):
"""
Report creating a metadata from an element.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param str metadata_id: ID of the new metadata.
:param str type: Type of the metadata.
:param str name: Name of the metadata.
"""
metadata = self._get_element(element_id)["metadata"]
metadata.append({"id": metadata_id, "type": type, "name": name})
def error(self, element_id, exception):
"""
Report that a Python exception occurred when processing an element.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param Exception exception: A Python exception.
"""
error_data = {
"class": exception.__class__.__name__,
"message": str(exception),
......@@ -129,6 +187,12 @@ class Reporter(object):
self._get_element(element_id)["errors"].append(error_data)
def save(self, path):
"""
Save the ML report to the specified path.
:param path: Path to save the ML report to.
:type path: str or pathlib.Path
"""
logger.info(f"Saving ML report to {path}")
with open(path, "w") as f:
json.dump(self.report_data, f)
# -*- coding: utf-8 -*-
"""
General utility functions and classes.
"""
import datetime
from timeit import default_timer
class Timer(object):
"""
A context manager to help measure execution times
A context manager to help measure execution times. Example usage::
with Timer() as t:
# do something interesting
print(t.delta) # X days, X:XX:XX
"""
def __init__(self):
......
# -*- coding: utf-8 -*-
"""
Base classes to implement Arkindex workers.
"""
import json
import os
import sys
......@@ -21,10 +25,29 @@ from arkindex_worker.worker.version import WorkerVersionMixin # noqa: F401
class ActivityState(Enum):
"""
Processing state of an element.
"""
Queued = "queued"
"""
The element has not yet been processed by a worker.
"""
Started = "started"
"""
The element is being processed by a worker.
"""
Processed = "processed"
"""
The element has been successfully processed by a worker.
"""
Error = "error"
"""
An error occurred while processing this element.
"""
class ElementsWorker(
......@@ -36,6 +59,13 @@ class ElementsWorker(
EntityMixin,
MetaDataMixin,
):
"""
Base class for ML workers that operate on Arkindex elements.
This class inherits from numerous mixin classes found in other modules of
``arkindex_worker.worker``, which provide helpers to read and write to the Arkindex API.
"""
def __init__(self, description="Arkindex Elements Worker", support_cache=False):
super().__init__(description, support_cache)
......@@ -52,11 +82,20 @@ class ElementsWorker(
nargs="+",
help="One or more Arkindex element ID",
)
self.classes = {}
self._worker_version_cache = {}
def list_elements(self):
"""
List the elements to be processed, either from the CLI arguments or
the cache database when enabled.
:return: An iterable of :class:`CachedElement` when cache support is enabled,
and a list of strings representing element IDs otherwise.
:rtype: Iterable[CachedElement] or List[str]
"""
assert not (
self.args.elements_list and self.args.element
), "elements-list and element CLI args shouldn't be both set"
......@@ -83,6 +122,12 @@ class ElementsWorker(
@property
def store_activity(self):
"""
Whether or not WorkerActivity support has been enabled on the DataImport
used to run this worker.
:rtype: bool
"""
if self.args.dev:
return False
assert (
......@@ -99,7 +144,10 @@ class ElementsWorker(
def run(self):
"""
Process every elements from the provided list
Implements an Arkindex worker that goes through each element returned by
:meth:`list_elements`. It calls :meth:`process_element`, catching exceptions
and reporting them using the :class:`Reporter`, and handles saving the report
once the process is complete as well as WorkerActivity updates when enabled.
"""
self.configure()
......@@ -178,13 +226,25 @@ class ElementsWorker(
sys.exit(1)
def process_element(self, element):
"""Override this method to analyze an Arkindex element from the provided list"""
"""
Override this method to implement your worker and process a single Arkindex element at once.
:param element: The element to process.
Will be a CachedElement instance if cache support is enabled,
and an Element instance otherwise.
:type element: Element or CachedElement
"""
def update_activity(self, element_id, state):
def update_activity(self, element_id, state) -> bool:
"""
Update worker activity for this element
Returns False when there is a conflict initializing the activity
Otherwise return True or the response payload
Update the WorkerActivity for this element and worker.
:param element_id: ID of the element.
:type element_id: str or uuid.UUID
:param ActivityState state: New WorkerActivity state for this element.
:returns bool: True if the update has been successful or WorkerActivity support is disabled.
False if the update has failed due to a conflict; this worker might have already processed
this element.
"""
if not self.store_activity:
logger.debug(
......
# -*- coding: utf-8 -*-
"""
The base class for all Arkindex workers.
"""
import argparse
import json
import logging
......@@ -28,10 +32,11 @@ from arkindex_worker.cache import (
)
def _is_500_error(exc):
def _is_500_error(exc) -> bool:
"""
Check if an Arkindex API error is a 50x
This is used to retry most API calls implemented here
Check if an Arkindex API error has an HTTP 5xx error code.
Used to retry most API calls in :class:`BaseWorker`.
:rtype: bool
"""
if not isinstance(exc, ErrorResponse):
return False
......@@ -40,7 +45,19 @@ def _is_500_error(exc):
class BaseWorker(object):
"""
Base class for Arkindex workers.
"""
def __init__(self, description="Arkindex Base Worker", support_cache=False):
"""
Initialize the worker.
:param str description: Description shown in the ``worker-...`` command line tool.
:param bool support_cache: Whether or not this worker supports the cache database.
Override the constructor and set this parameter to start using the cache database.
"""
self.parser = argparse.ArgumentParser(description=description)
# Setup workdir either in Ponos environment or on host's home
......@@ -71,13 +88,19 @@ class BaseWorker(object):
self.use_cache = False
@property
def is_read_only(self):
"""Worker cannot publish anything in dev mode or without a worker version ID"""
def is_read_only(self) -> bool:
"""
Whether or not the worker is in read-only mode and cannot publish data.
:returns: True when dev mode is enabled with the ``--dev`` CLI argument,
or when no worker version ID is provided.
:rtype: bool
"""
return self.args.dev or self.worker_version_id is None
def configure(self):
"""
Configure worker using cli args and environment variables
Configure worker using CLI args and environment variables.
"""
self.parser.add_argument(
"-c",
......@@ -220,7 +243,12 @@ class BaseWorker(object):
logger.debug("Cache is disabled")
def load_secret(self, name):
"""Load all secrets described in the worker configuration"""
"""
Load a Ponos secret by name.
:param str name: Name of the Ponos secret.
:raises Exception: When the secret cannot be loaded from the API or from the local secrets directory.
"""
secret = None
# Load from the backend
......@@ -274,18 +302,19 @@ class BaseWorker(object):
)
def request(self, *args, **kwargs):
"""
Proxy all Arkindex API requests with a retry mechanism
in case of 50X errors
The same API call will be retried 5 times, with an exponential sleep time
going through 3, 4, 8 and 16 seconds of wait between call.
If the 5th call still gives a 50x, the exception is re-raised
and the caller should catch it
Log messages are displayed before sleeping (when at least one exception occurred)
Wrapper around the ``ArkindexClient.request`` method.
The API call will be retried up to 5 times in case of HTTP 5xx errors,
with an exponential sleep time of 3, 4, 8 and 16 seconds between calls.
If the 5th call still causes an HTTP 5xx error, the exception is re-raised
and the caller should catch it.
Log messages are displayed when an HTTP 5xx error occurs, before waiting for the next call.
"""
return self.api_client.request(*args, **kwargs)
def add_arguments(self):
"""Override this method to add argparse argument to this worker"""
"""Override this method to add ``argparse`` arguments to this worker"""
def run(self):
"""Override this method to implement your own process"""
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for classifications and ML classes.
"""
import os
from apistar.exceptions import ErrorResponse
......@@ -10,9 +14,15 @@ from arkindex_worker.models import Element
class ClassificationMixin(object):
"""
Mixin for the :class:`ElementsWorker` to add ``MLClass`` and ``Classification`` helpers.
"""
def load_corpus_classes(self, corpus_id):
"""
Load ML classes for the given corpus ID
Load all ML classes for the given corpus ID and store them in the ``self.classes`` cache.
:param str corpus_id: ID of the corpus.
"""
corpus_classes = self.api_client.paginate(
"ListCorpusMLClasses",
......@@ -25,8 +35,14 @@ class ClassificationMixin(object):
def get_ml_class_id(self, corpus_id, ml_class):
"""
Return the ID corresponding to the given class name on a specific corpus
This method will automatically create missing classes
Return the MLClass ID corresponding to the given class name on a specific corpus.
If no MLClass exists for this class name, a new one is created.
:param corpus_id: ID of the corpus, or None to use the ``ARKINDEX_CORPUS_ID`` environment variable.
:type corpus_id: str or None
:param str ml_class: Name of the MLClass.
:returns str: ID of the retrieved or created MLClass.
"""
if corpus_id is None:
corpus_id = os.environ.get("ARKINDEX_CORPUS_ID")
......@@ -64,7 +80,14 @@ class ClassificationMixin(object):
self, element, ml_class, confidence, high_confidence=False
):
"""
Create a classification on the given element through API
Create a classification on the given element through the API.
:param element: The element to create a classification on.
:type element: Element or CachedElement
:param str ml_class: Name of the MLClass to use.
:param float confidence: Confidence score for the classification. Must be between 0 and 1.
:param bool high_confidence: Whether or not the classification is of high confidence.
:returns dict: The created classification, as returned by the ``CreateClassification`` API endpoint.
"""
assert element and isinstance(
element, (Element, CachedElement)
......@@ -136,7 +159,23 @@ class ClassificationMixin(object):
def create_classifications(self, element, classifications):
"""
Create multiple classifications at once on the given element through the API
Create multiple classifications at once on the given element through the API.
:param element: The element to create classifications on.
:type element: Element or CachedElement
:param classifications: The classifications to create, as a list of dicts with the following keys:
class_name (str)
Name of the MLClass for this classification.
confidence (float)
Confidence score, between 0 and 1.
high_confidence (bool)
High confidence state of the classification.
:type classifications: List[Dict[str, Union[str, float, bool]]]
:returns: List of created classifications, as returned in the ``classifications`` field by
the ``CreateClassifications`` API endpoint.
:rtype: List[Dict[str, Union[str, float, bool]]]
"""
assert element and isinstance(
element, (Element, CachedElement)
......
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for elements and element types.
"""
import uuid
from typing import Dict, List, Optional, Union
from typing import Dict, Iterable, List, Optional, Union
from peewee import IntegrityError
......@@ -17,10 +20,19 @@ class MissingTypeError(Exception):
class ElementMixin(object):
"""
Mixin for the :class:`ElementsWorker` to provide ``Element`` helpers.
"""
def check_required_types(self, corpus_id: str, *type_slugs: str) -> bool:
"""
Check that a corpus has a list of required element types,
and raise an exception if any of them are missing.
:param str corpus_id: ID of the corpus to check types on.
:param str \\*type_slugs: Type slugs to look for.
:returns bool: True if all of the specified type slugs have been found.
:raises MissingTypeError: If any of the specified type slugs were not found.
"""
assert isinstance(
corpus_id, (uuid.UUID, str)
......@@ -59,8 +71,7 @@ class ElementMixin(object):
:type polygon: list(list(int or float))
:param confidence: Optional confidence score, between 0.0 and 1.0.
:type confidence: float or None
:returns: UUID of the created element.
:rtype: str
:returns str: UUID of the created element.
"""
assert element and isinstance(
element, Element
......@@ -241,19 +252,53 @@ class ElementMixin(object):
def list_element_children(
self,
element,
folder=None,
name=None,
recursive=None,
type=None,
with_classes=None,
with_corpus=None,
with_has_children=None,
with_zone=None,
worker_version=None,
):
element: Union[Element, CachedElement],
folder: Optional[bool] = None,
name: Optional[str] = None,
recursive: Optional[bool] = None,
type: Optional[str] = None,
with_classes: Optional[bool] = None,
with_corpus: Optional[bool] = None,
with_has_children: Optional[bool] = None,
with_zone: Optional[bool] = None,
worker_version: Optional[str] = None,
) -> Union[Iterable[dict], Iterable[CachedElement]]:
"""
List children of an element
List children of an element.
:param element: Parent element to find children of.
:type element: Union[Element, CachedElement]
:param folder: Restrict to or exclude elements with folder types.
This parameter is not supported when caching is enabled.
:type folder: Optional[bool]
:param name: Restrict to elements whose name contains a substring (case-insensitive).
This parameter is not supported when caching is enabled.
:type name: Optional[str]
:param recursive: Look for elements recursively (grand-children, etc.).
This parameter is not supported when caching is enabled.
:type recursive: Optional[bool]
:param type: Restrict to elements with a specific type slug.
This parameter is not supported when caching is enabled.
:type type: Optional[str]
:param with_classes: Include each element's classifications in the response.
This parameter is not supported when caching is enabled.
:type with_classes: Optional[bool]
:param with_corpus: Include each element's corpus in the response.
This parameter is not supported when caching is enabled.
:type with_corpus: Optional[bool]
:param with_has_children: Include the ``has_children`` attribute in the response,
indicating if this element has child elements of its own.
This parameter is not supported when caching is enabled.
:type with_has_children: Optional[bool]
:param with_zone: Include the ``zone`` attribute in the response,
holding the element's image and polygon.
This parameter is not supported when caching is enabled.
:type with_zone: Optional[bool]
:param worker_version: Restrict to elements created by a worker version with this UUID.
:type worker_version: Optional[str]
:return: An iterable of dicts from the ``ListElementChildren`` API endpoint,
or an iterable of :class:`CachedElement` when caching is enabled.
:rtype: Union[Iterable[dict], Iterable[CachedElement]]
"""
assert element and isinstance(
element, (Element, CachedElement)
......
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for entities.
"""
import os
from enum import Enum
......@@ -10,6 +14,10 @@ from arkindex_worker.models import Element
class EntityType(Enum):
"""
Type of an entity.
"""
Person = "person"
Location = "location"
Subject = "subject"
......@@ -20,12 +28,25 @@ class EntityType(Enum):
class EntityMixin(object):
"""
Mixin for the :class:`ElementsWorker` to add ``Entity`` helpers.
"""
def create_entity(
self, element, name, type, corpus=None, metas=dict(), validated=None
):
"""
Create an entity on the given corpus through API
Return the ID of the created entity
Create an entity on the given corpus.
If cache support is enabled, a :class:`CachedEntity` will also be created.
:param element: An element on which the entity will be reported with the :class:`Reporter`.
This does not have any effect on the entity itself.
:type element: Element or CachedElement
:param str name: Name of the entity.
:param EntityType type: Type of the entity.
:param corpus: UUID of the corpus to create an entity on, or None to use the
value of the ``ARKINDEX_CORPUS_ID`` environment variable.
:type corpus: str or None
"""
if corpus is None:
corpus = os.environ.get("ARKINDEX_CORPUS_ID")
......@@ -86,7 +107,19 @@ class EntityMixin(object):
self, transcription, entity, offset, length, confidence=None
):
"""
Create a link between an existing entity and an existing transcription through API
Create a link between an existing entity and an existing transcription.
If cache support is enabled, a :class:`CachedTranscriptionEntity` will also be created.
:param str transcription: UUID of the existing transcription.
:param str entity: UUID of the existing entity.
:param int offset: Starting position of the entity in the transcription's text,
as a 0-based index.
:param int length: Length of the entity in the transcription's text.
:param confidence: Optional confidence score between 0 and 1.
:type confidence: float or None
:returns: A dict as returned by the ``CreateTranscriptionEntity`` API endpoint,
or None if the worker is in read-only mode.
:rtype: dict(str, str or int) or None
"""
assert transcription and isinstance(
transcription, str
......
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for metadata.
"""
from enum import Enum
from arkindex_worker import logger
......@@ -6,20 +10,67 @@ from arkindex_worker.models import Element
class MetaType(Enum):
    """
    Kind of a metadata, describing how its value should be interpreted.
    """

    Text = "text"
    """
    Plain string, without any special interpretation.
    """

    HTML = "html"
    """
    String value that should be interpreted as HTML content.
    For security reasons, only a restricted set of HTML tags is allowed.
    """

    Date = "date"
    """
    String value that should be interpreted as a date,
    formatted as an ISO 8601 date (``YYYY-MM-DD``).
    """

    Location = "location"
    """
    String value that should be interpreted as a location.
    """

    Reference = "reference"
    """
    String value that should be interpreted as an external identifier for the element,
    for example to keep a link to the original data from before it was imported
    into Arkindex.
    """

    Numeric = "numeric"
    """
    Floating point value.
    """

    URL = "url"
    """
    String value that should be interpreted as a URL.
    Only the ``http`` and ``https`` schemes are allowed.
    """
class MetaDataMixin(object):
"""
Mixin for the :class:`ElementsWorker` to add ``MetaData`` helpers.
"""
def create_metadata(self, element, type, name, value, entity=None):
"""
Create a metadata on the given element through API
Create a metadata on the given element through API.
:param Element element: The element to create a metadata on.
:param MetaType type: Type of the metadata.
:param str name: Name of the metadata.
:param str value: Value of the metadata.
:param entity: UUID of an entity this metadata is related to.
:type entity: str or None
:returns: UUID of the created metadata.
:rtype: str
"""
assert element and isinstance(
element, Element
......
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for transcriptions.
"""
from enum import Enum
......@@ -10,13 +13,37 @@ from arkindex_worker.models import Element
class TextOrientation(Enum):
    """
    Reading orientation of the text of a transcription.
    """

    HorizontalLeftToRight = "horizontal-lr"
    """
    Text read from top to bottom, then from left to right.
    This orientation is the default when none is specified.
    """

    HorizontalRightToLeft = "horizontal-rl"
    """
    Text read from top to bottom, then from right to left.
    """

    VerticalRightToLeft = "vertical-rl"
    """
    Text read from right to left, then from top to bottom.
    """

    VerticalLeftToRight = "vertical-lr"
    """
    Text read from left to right, then from top to bottom.
    """
class TranscriptionMixin(object):
"""
Mixin for the :class:`ElementsWorker` to provide ``Transcription`` helpers.
"""
def create_transcription(
self,
element,
......@@ -26,6 +53,15 @@ class TranscriptionMixin(object):
):
"""
Create a transcription on the given element through the API.
:param element: Element to create a transcription on.
:type element: Element or CachedElement
:param str text: Text of the transcription.
:param float confidence: Confidence score, between 0 and 1.
:param TextOrientation orientation: Orientation of the transcription's text.
:returns: A dict as returned by the ``CreateTranscription`` API endpoint,
or None if the worker is in read-only mode.
:rtype: Dict[str, Union[str, float]] or None
"""
assert element and isinstance(
element, (Element, CachedElement)
......@@ -82,7 +118,21 @@ class TranscriptionMixin(object):
def create_transcriptions(self, transcriptions):
"""
Create multiple transcriptions at once on existing elements through the API.
Create multiple transcriptions at once on existing elements through the API,
and creates :class:`CachedTranscription` instances if cache support is enabled.
:param transcriptions: A list of dicts representing a transcription each, with the following keys:
element_id (str)
Required. UUID of the element to create this transcription on.
text (str)
Required. Text of the transcription.
confidence (float)
Required. Confidence score between 0 and 1.
orientation (:class:`TextOrientation`)
Optional. Orientation of the transcription's text.
:returns: A list of dicts as returned in the ``transcriptions`` field by the ``CreateTranscriptions`` API endpoint.
"""
assert transcriptions and isinstance(
......@@ -154,7 +204,23 @@ class TranscriptionMixin(object):
def create_element_transcriptions(self, element, sub_element_type, transcriptions):
"""
Create multiple sub elements with their transcriptions on the given element through API
Create multiple elements and transcriptions at once on a single parent element through the API.
:param element: Element to create elements and transcriptions on.
:type element: Element or CachedElement
:param str sub_element_type: Slug of the element type to use for the new elements.
:param transcriptions: A list of dicts representing an element and transcription each, with the following keys:
polygon (list(list(int or float)))
Required. Polygon of the element.
text (str)
Required. Text of the transcription.
confidence (float)
Required. Confidence score between 0 and 1.
orientation (:class:`TextOrientation`)
Optional. Orientation of the transcription's text.
:returns: A list of dicts as returned by the ``CreateElementTranscriptions`` API endpoint.
"""
assert element and isinstance(
element, (Element, CachedElement)
......@@ -285,7 +351,19 @@ class TranscriptionMixin(object):
self, element, element_type=None, recursive=None, worker_version=None
):
"""
List transcriptions on an element
List transcriptions on an element.
:param element: The element to list transcriptions on.
:type element: Element or CachedElement
:param element_type: Restrict to transcriptions whose elements have an element type with this slug.
:type element_type: Optional[str]
:param recursive: Include transcriptions of any descendant of this element, recursively.
:type recursive: Optional[bool]
:param worker_version: Restrict to transcriptions created by a worker version with this UUID.
:type worker_version: Optional[str]
:returns: An iterable of dicts representing each transcription,
or an iterable of CachedTranscription when cache support is enabled.
:rtype: Iterable[dict] or Iterable[CachedTranscription]
"""
assert element and isinstance(
element, (Element, CachedElement)
......
# -*- coding: utf-8 -*-
"""
ElementsWorker methods for worker versions.
"""
class WorkerVersionMixin(object):
"""
Mixin for the :class:`ElementsWorker` to provide ``WorkerVersion`` helpers.
"""
def get_worker_version(self, worker_version_id: str) -> dict:
"""
Get worker version from cache if possible, otherwise make API request
Retrieve a worker version, using the :class:`ElementsWorker`'s internal cache when possible.
:param str worker_version_id: ID of the worker version to retrieve.
:returns: The requested worker version, as returned by the ``RetrieveWorkerVersion`` API endpoint.
:rtype: dict
"""
if worker_version_id is None:
raise ValueError("No worker version ID")
......@@ -19,11 +29,11 @@ class WorkerVersionMixin(object):
def get_worker_version_slug(self, worker_version_id: str) -> str:
    """
    Look up the slug of the worker that a worker version belongs to.

    The worker version is obtained through :meth:`get_worker_version`,
    which uses the internal cache if possible and otherwise makes an API request.

    :param str worker_version_id: UUID of the worker version to find a slug for.
    :returns: Slug of the worker of this worker version.
    :rtype: str
    """
    # The slug is nested under the "worker" key of the worker version payload.
    return self.get_worker_version(worker_version_id)["worker"]["slug"]