Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • workers/base-worker
1 result
Show changes
Commits on Source (26)
Showing
with 762 additions and 78 deletions
Arkindex base Worker
====================
# Arkindex base Worker
An easy to use Python 3 high level API client, to build ML tasks.
## Create a new worker using our template
```
pip install --user cookiecutter
cookiecutter git@gitlab.com:arkindex/base-worker.git
cookiecutter git@gitlab.com:teklia/workers/base-worker.git
```
0.3.0
0.3.1-rc2
......@@ -94,7 +94,9 @@ class CachedElement(Model):
rotation_angle = IntegerField(default=0)
mirrored = BooleanField(default=False)
initial = BooleanField(default=False)
# Needed to filter elements with cache
worker_version_id = UUIDField(null=True)
worker_run_id = UUIDField(null=True)
confidence = FloatField(null=True)
class Meta:
......@@ -172,7 +174,9 @@ class CachedTranscription(Model):
text = TextField()
confidence = FloatField()
orientation = CharField(max_length=50)
# Needed to filter transcriptions with cache
worker_version_id = UUIDField(null=True)
worker_run_id = UUIDField(null=True)
class Meta:
database = db
......@@ -189,7 +193,7 @@ class CachedClassification(Model):
class_name = TextField()
confidence = FloatField()
state = CharField(max_length=10)
worker_version_id = UUIDField(null=True)
worker_run_id = UUIDField(null=True)
class Meta:
database = db
......@@ -206,7 +210,7 @@ class CachedEntity(Model):
name = TextField()
validated = BooleanField(default=False)
metas = JSONField(null=True)
worker_version_id = UUIDField(null=True)
worker_run_id = UUIDField(null=True)
class Meta:
database = db
......@@ -224,7 +228,7 @@ class CachedTranscriptionEntity(Model):
entity = ForeignKeyField(CachedEntity, backref="transcription_entities")
offset = IntegerField(constraints=[Check("offset >= 0")])
length = IntegerField(constraints=[Check("length > 0")])
worker_version_id = UUIDField(null=True)
worker_run_id = UUIDField(null=True)
confidence = FloatField(null=True)
class Meta:
......
......@@ -5,7 +5,7 @@ Wrappers around API results to provide more convenient attribute access and IIIF
import tempfile
from contextlib import contextmanager
from typing import Generator, Optional
from typing import Generator, List, Optional
from PIL import Image
from requests import HTTPError
......@@ -93,6 +93,19 @@ class Element(MagicDict):
url += "/"
return "{}full/{}/0/default.jpg".format(url, size)
@property
def polygon(self) -> List[float]:
    """
    Access an Element's polygon.

    This is a shortcut to an Element's polygon, normally accessed via
    its zone field via `zone.polygon`. This is mostly done
    to facilitate access to this important field by matching
    the [CachedElement][arkindex_worker.cache.CachedElement].polygon field.

    :raises ValueError: When the element has no zone.
    """
    # Happy path first: delegate to the zone's polygon.
    # Note: the attribute access `self.zone` (not the raw `.get()` result)
    # is kept so the magic attribute wrapping still applies.
    if self.get("zone"):
        return self.zone.polygon
    raise ValueError("Element {} has no zone".format(self.id))
@property
def requires_tiles(self) -> bool:
"""
......
......@@ -14,6 +14,7 @@ from uuid import UUID
from apistar.exceptions import ErrorResponse
from arkindex_worker import logger
from arkindex_worker.models import Transcription
class Reporter(object):
......@@ -53,6 +54,8 @@ class Reporter(object):
"classifications": {},
# Created entities ({"id": "", "type": "", "name": ""}) from this element
"entities": [],
# Created transcription entities ({"transcription_id": "", "entity_id": ""}) from this element
"transcription_entities": [],
# Created metadata ({"id": "", "type": "", "name": ""}) from this element
"metadata": [],
"errors": [],
......@@ -141,6 +144,30 @@ class Reporter(object):
entities = self._get_element(element_id)["entities"]
entities.append({"id": entity_id, "type": type, "name": name})
def add_transcription_entity(
    self,
    entity_id: Union[str, UUID],
    transcription: Transcription,
    transcription_entity_id: Union[str, UUID],
):
    """
    Report creating a transcription entity on an element.

    :param entity_id: ID of the entity element.
    :param transcription: Transcription to add the entity on
    :param transcription_entity_id: ID of the transcription entity that is created.
    """
    # Build the report entry, then register it on the transcription's
    # parent element (elements are lazily created by _get_element).
    entry = {
        "transcription_id": transcription.id,
        "entity_id": entity_id,
        "transcription_entity_id": transcription_entity_id,
    }
    element_report = self._get_element(transcription.element.id)
    element_report["transcription_entities"].append(entry)
def add_entity_link(self, *args, **kwargs):
"""
Report creating an entity link. Not currently supported.
......
......@@ -45,6 +45,12 @@ def _is_500_error(exc: Exception) -> bool:
return 500 <= exc.status_code < 600
class ModelNotFoundError(Exception):
    """
    Raised when the path towards the model is invalid:
    no path was provided, or the given path does not exist on disk.
    """
class BaseWorker(object):
"""
Base class for Arkindex workers.
......@@ -94,6 +100,12 @@ class BaseWorker(object):
action="store_true",
default=False,
)
# To load models locally
self.parser.add_argument(
"--model-dir",
help=("The path to a local model's directory (development only)."),
type=Path,
)
# Call potential extra arguments
self.add_arguments()
......@@ -110,11 +122,10 @@ class BaseWorker(object):
self.work_dir = os.path.join(xdg_data_home, "arkindex")
os.makedirs(self.work_dir, exist_ok=True)
self.worker_version_id = os.environ.get("WORKER_VERSION_ID")
if not self.worker_version_id:
logger.warning(
"Missing WORKER_VERSION_ID environment variable, worker is in read-only mode"
)
# Store task ID. This is only available when running in production
# through a ponos agent
self.task_id = os.environ.get("PONOS_TASK")
self.worker_run_id = os.environ.get("ARKINDEX_WORKER_RUN_ID")
if not self.worker_run_id:
logger.warning(
......@@ -128,6 +139,7 @@ class BaseWorker(object):
# or in configure_for_developers() from the environment
self.corpus_id = None
self.user_configuration = {}
self.model_configuration = {}
self.support_cache = support_cache
# use_cache will be updated in configure() if the cache is supported and if there
# is at least one available sqlite database either given or in the parent tasks
......@@ -142,13 +154,9 @@ class BaseWorker(object):
Whether or not the worker can publish data.
:returns: False when dev mode is enabled with the ``--dev`` CLI argument,
when no worker version ID is provided or when no worker run ID is provided
when no worker run ID is provided
"""
return (
self.args.dev
or self.worker_version_id is None
or self.worker_run_id is None
)
return self.args.dev or self.worker_run_id is None
def setup_api_client(self):
"""
......@@ -211,6 +219,10 @@ class BaseWorker(object):
# Load worker version information
worker_version = worker_run["worker_version"]
# Store worker version id
self.worker_version_id = worker_version["id"]
self.worker_details = worker_version["worker"]
logger.info(
f"Loaded worker {self.worker_details['name']} revision {worker_version['revision']['hash'][0:7]} from API"
......@@ -236,6 +248,12 @@ class BaseWorker(object):
logger.info("Loaded user configuration from WorkerRun")
self.user_configuration.update(worker_configuration.get("configuration"))
# Load model version configuration when available
model_version = worker_run.get("model_version")
if model_version and model_version.get("configuration"):
logger.info("Loaded model version configuration from WorkerRun")
self.model_configuration.update(model_version.get("configuration"))
# if debug mode is set to true activate debug mode in logger
if self.user_configuration.get("debug"):
logger.setLevel(logging.DEBUG)
......@@ -245,12 +263,11 @@ class BaseWorker(object):
"""
Setup the necessary attribute when using the cache system of `Base-Worker`.
"""
task_id = os.environ.get("PONOS_TASK")
paths = None
if self.support_cache and self.args.database is not None:
self.use_cache = True
elif self.support_cache and task_id:
task = self.request("RetrieveTaskFromAgent", id=task_id)
elif self.support_cache and self.task_id:
task = self.request("RetrieveTaskFromAgent", id=self.task_id)
paths = retrieve_parents_cache_path(
task["parents"],
data_dir=os.environ.get("PONOS_DATA", "/data"),
......@@ -265,7 +282,9 @@ class BaseWorker(object):
), f"Database in {self.args.database} does not exist"
self.cache_path = self.args.database
else:
cache_dir = os.path.join(os.environ.get("PONOS_DATA", "/data"), task_id)
cache_dir = os.path.join(
os.environ.get("PONOS_DATA", "/data"), self.task_id
)
assert os.path.isdir(cache_dir), f"Missing task cache in {cache_dir}"
self.cache_path = os.path.join(cache_dir, "db.sqlite")
......@@ -335,6 +354,35 @@ class BaseWorker(object):
# By default give raw secret payload
return secret
def find_model_directory(self) -> Path:
    """
    Find the local path to the model. This supports two modes:
    - the worker runs in ponos, the model is available at `/data/current`
    - the worker runs locally, the developer may specify it using either
        - the `model_dir` configuration parameter
        - the `--model-dir` CLI parameter

    :return: Path to the model on disk
    :raises ModelNotFoundError: When no path was provided, or the
        provided path does not exist.
    """
    # Production mode: the ponos agent has already downloaded the model
    # into the current task's working directory.
    if self.task_id:
        return Path(self.work_dir)

    # Development mode: the configuration parameter takes precedence
    # over the CLI argument.
    model_dir = self.config.get("model_dir", self.args.model_dir)
    if model_dir is None:
        raise ModelNotFoundError(
            "No path to the model was provided. "
            "Please provide model_dir either through configuration "
            "or as CLI argument."
        )
    model_dir = Path(model_dir)
    if not model_dir.exists():
        raise ModelNotFoundError(
            f"The path {model_dir} does not link to any directory"
        )
    return model_dir
@retry(
retry=retry_if_exception(_is_500_error),
wait=wait_exponential(multiplier=2, min=3),
......
......@@ -104,7 +104,7 @@ class ClassificationMixin(object):
body={
"element": str(element.id),
"ml_class": self.get_ml_class_id(ml_class),
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"confidence": confidence,
"high_confidence": high_confidence,
},
......@@ -120,7 +120,7 @@ class ClassificationMixin(object):
"class_name": ml_class,
"confidence": created["confidence"],
"state": created["state"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
]
CachedClassification.insert_many(to_insert).execute()
......@@ -130,15 +130,23 @@ class ClassificationMixin(object):
)
except ErrorResponse as e:
# Detect already existing classification
if (
e.status_code == 400
and "non_field_errors" in e.content
and "The fields element, worker_version, ml_class must make a unique set."
in e.content["non_field_errors"]
):
logger.warning(
f"This worker version has already set {ml_class} on element {element.id}"
)
if e.status_code == 400 and "non_field_errors" in e.content:
if (
"The fields element, worker_version, ml_class must make a unique set."
in e.content["non_field_errors"]
):
logger.warning(
f"This worker version has already set {ml_class} on element {element.id}"
)
elif (
"The fields element, worker_run, ml_class must make a unique set."
in e.content["non_field_errors"]
):
logger.warning(
f"This worker run has already set {ml_class} on element {element.id}"
)
else:
raise
return
# Propagate any other API error
......@@ -157,14 +165,10 @@ class ClassificationMixin(object):
Create multiple classifications at once on the given element through the API.
:param element: The element to create classifications on.
:param classifications: The classifications to create, as a list of dicts with the following keys:
class_name (str)
Name of the MLClass for this classification.
confidence (float)
Confidence score, between 0 and 1.
high_confidence (bool)
High confidence state of the classification.
:param classifications: The classifications to create, a list of dicts. Each of them contains
a **class_name** (str), the name of the MLClass for this classification;
a **confidence** (float), the confidence score, between 0 and 1;
a **high_confidence** (bool), the high confidence state of the classification.
:returns: List of created classifications, as returned in the ``classifications`` field by
the ``CreateClassifications`` API endpoint.
......@@ -205,7 +209,7 @@ class ClassificationMixin(object):
"CreateClassifications",
body={
"parent": str(element.id),
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"classifications": classifications,
},
)["classifications"]
......@@ -223,7 +227,7 @@ class ClassificationMixin(object):
"class_name": created_cl["class_name"],
"confidence": created_cl["confidence"],
"state": created_cl["state"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
for created_cl in created_cls
]
......
......@@ -138,7 +138,7 @@ class ElementMixin(object):
"corpus": element.corpus.id,
"polygon": polygon,
"parent": element.id,
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"confidence": confidence,
},
)
......@@ -233,7 +233,7 @@ class ElementMixin(object):
"CreateElements",
id=parent.id,
body={
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"elements": elements,
},
)
......@@ -265,7 +265,7 @@ class ElementMixin(object):
"type": element["type"],
"image_id": image_id,
"polygon": element["polygon"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"confidence": element.get("confidence"),
}
for idx, element in enumerate(elements)
......
......@@ -70,7 +70,7 @@ class EntityMixin(object):
"metas": metas,
"validated": validated,
"corpus": self.corpus_id,
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
},
)
self.report.add_entity(element.id, entity["id"], type.value, name)
......@@ -85,7 +85,7 @@ class EntityMixin(object):
"name": name,
"validated": validated if validated is not None else False,
"metas": metas,
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
]
CachedEntity.insert_many(to_insert).execute()
......@@ -96,7 +96,7 @@ class EntityMixin(object):
def create_transcription_entity(
self,
transcription: str,
transcription: Transcription,
entity: str,
offset: int,
length: int,
......@@ -106,7 +106,7 @@ class EntityMixin(object):
Create a link between an existing entity and an existing transcription.
If cache support is enabled, a `CachedTranscriptionEntity` will also be created.
:param transcription: UUID of the existing transcription.
:param transcription: Transcription to create the entity on.
:param entity: UUID of the existing entity.
:param offset: Starting position of the entity in the transcription's text,
as a 0-based index.
......@@ -116,8 +116,8 @@ class EntityMixin(object):
or None if the worker is in read-only mode.
"""
assert transcription and isinstance(
transcription, str
), "transcription shouldn't be null and should be of type str"
transcription, Transcription
), "transcription shouldn't be null and should be a Transcription"
assert entity and isinstance(
entity, str
), "entity shouldn't be null and should be of type str"
......@@ -140,27 +140,27 @@ class EntityMixin(object):
"entity": entity,
"length": length,
"offset": offset,
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
if confidence is not None:
body["confidence"] = confidence
transcription_ent = self.request(
"CreateTranscriptionEntity",
id=transcription,
id=transcription.id,
body=body,
)
# TODO: Report transcription entity creation
self.report.add_transcription_entity(entity, transcription, transcription_ent)
if self.use_cache:
# Store transcription entity in local cache
try:
CachedTranscriptionEntity.create(
transcription=transcription,
transcription=transcription.id,
entity=entity,
offset=offset,
length=length,
worker_version_id=self.worker_version_id,
worker_run_id=self.worker_run_id,
confidence=confidence,
)
except IntegrityError as e:
......
......@@ -7,6 +7,7 @@ from enum import Enum
from typing import Dict, List, Optional, Union
from arkindex_worker import logger
from arkindex_worker.cache import CachedElement
from arkindex_worker.models import Element
......@@ -59,7 +60,7 @@ class MetaType(Enum):
class MetaDataMixin(object):
def create_metadata(
self,
element: Element,
element: Union[Element, CachedElement],
type: MetaType,
name: str,
value: str,
......@@ -76,8 +77,8 @@ class MetaDataMixin(object):
:returns: UUID of the created metadata.
"""
assert element and isinstance(
element, Element
), "element shouldn't be null and should be of type Element"
element, (Element, CachedElement)
), "element shouldn't be null and should be of type Element or CachedElement"
assert type and isinstance(
type, MetaType
), "type shouldn't be null and should be of type MetaType"
......@@ -101,7 +102,7 @@ class MetaDataMixin(object):
"name": name,
"value": value,
"entity_id": entity,
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
},
)
self.report.add_metadata(element.id, metadata["id"], type.value, name)
......@@ -110,7 +111,7 @@ class MetaDataMixin(object):
def create_metadatas(
self,
element: Element,
element: Union[Element, CachedElement],
metadatas: List[
Dict[
str, Union[MetaType, str, Union[str, Union[int, float]], Optional[str]]
......@@ -129,8 +130,8 @@ class MetaDataMixin(object):
- entity_id : Union[str, None]
"""
assert element and isinstance(
element, Element
), "element shouldn't be null and should be of type Element"
element, (Element, CachedElement)
), "element shouldn't be null and should be of type Element or CachedElement"
assert metadatas and isinstance(
metadatas, list
......@@ -176,7 +177,6 @@ class MetaDataMixin(object):
"CreateMetaDataBulk",
id=element.id,
body={
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"metadata_list": metas,
},
......@@ -187,7 +187,9 @@ class MetaDataMixin(object):
return created_metadatas
def list_element_metadata(self, element: Element) -> List[Dict[str, str]]:
def list_element_metadata(
self, element: Union[Element, CachedElement]
) -> List[Dict[str, str]]:
"""
List all metadata linked to an element.
This method does not support cache.
......@@ -195,7 +197,7 @@ class MetaDataMixin(object):
:param element: The element to list metadata on.
"""
assert element and isinstance(
element, Element
), "element shouldn't be null and should be of type Element"
element, (Element, CachedElement)
), "element shouldn't be null and should be of type Element or CachedElement"
return self.api_client.paginate("ListElementMetaData", id=element.id)
......@@ -82,7 +82,7 @@ class TranscriptionMixin(object):
id=element.id,
body={
"text": text,
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"confidence": confidence,
"orientation": orientation.value,
},
......@@ -100,7 +100,7 @@ class TranscriptionMixin(object):
"text": created["text"],
"confidence": created["confidence"],
"orientation": created["orientation"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
]
CachedTranscription.insert_many(to_insert).execute()
......@@ -170,7 +170,7 @@ class TranscriptionMixin(object):
created_trs = self.request(
"CreateTranscriptions",
body={
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"transcriptions": transcriptions_payload,
},
)["transcriptions"]
......@@ -188,7 +188,7 @@ class TranscriptionMixin(object):
"text": created_tr["text"],
"confidence": created_tr["confidence"],
"orientation": created_tr["orientation"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
for created_tr in created_trs
]
......@@ -283,7 +283,7 @@ class TranscriptionMixin(object):
id=element.id,
body={
"element_type": sub_element_type,
"worker_version": self.worker_version_id,
"worker_run_id": self.worker_run_id,
"transcriptions": transcriptions_payload,
"return_elements": True,
},
......@@ -320,7 +320,7 @@ class TranscriptionMixin(object):
"type": sub_element_type,
"image_id": element.image_id,
"polygon": transcription["polygon"],
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
)
......@@ -335,7 +335,7 @@ class TranscriptionMixin(object):
"orientation": transcription.get(
"orientation", TextOrientation.HorizontalLeftToRight
).value,
"worker_version_id": self.worker_version_id,
"worker_run_id": self.worker_run_id,
}
)
......
black==22.8.0
doc8==0.11.1
mkdocs==1.3.1
mkdocs==1.4.0
mkdocstrings==0.19.0
mkdocstrings-python==0.7.1
recommonmark==0.7.1
# GitLab CI for workers
This page describes how continuous integration (CI) is used in workers created
using the `base-worker` template.
For more information on creating workers, see
[Setting up a worker](../create).
## Default template
When creating a worker with our official template, a `.gitlab-ci.yml` file has
been included with a few actions that will run on every push you make.
The CI jobs will run in the following order:
<img style="display:block;float:none;margin-left:auto;margin-right:auto;" src="./pipeline.svg" alt="CI pipeline execution order">
## Git Flow
At Teklia, we use a simple version of [Git Flow][gitflow]:
- The `default` branch should always have validated code and should be deployable
in production at any time.
- Developments should happen in branches, with merge requests to enable code
review and Gitlab CI pipelines.
- Project maintainers should use Git tags to create official releases, by
updating the `VERSION` file and using the same version string as the tag name.
This process is reflected in the template's `.gitlab-ci.yml` file.
## Linting
The `lint` job uses [pre-commit] to run source code linters on your
project and validate various rules:
- Checking your Python code is PEP8 compliant
- Auto-formatting your Python code using [black]
- Sort your Python imports
- Check you don't have any trailing white space
- Check your YAML files are well formatted
- Fix some common spelling errors
You can set up pre-commit to run locally too; see
[Activating the pre-commit hook](../create#activating-the-pre-commit-hook).
## Testing
The `test` job uses [tox] and [pytest] modules to run written unit
tests for your repository and avoid any kind of code regression.
Any unit test you have added to your project will be executed on each git push,
allowing you to check the validity of your code before merging it.
Unit tests allow you to prevent regressions in your code when making changes,
and find bugs before they make their way into production.
<!-- TODO:
For more information, see [Writing unit tests for your worker](../tests).
-->
## Building
When the `test` & `lint` jobs run successfully, the `docker` job runs. It will
try to build a docker image from your `Dockerfile`. This will check that your
`Dockerfile` is valid and builds an image successfully.
This build step is only used as a check, as Arkindex builds Docker images on
its own.
## Generating release notes
When the `docker` job is successful and the CI pipeline is running for a Git
tag, the `release-notes` job runs. It will list all the commits since the
previous tag and aggregate them to publish release notes on the GitLab project.
We provide an [open source docker image](https://gitlab.com/teklia/devops/) to build these release notes,
but you'll need to provide your own Gitlab access token so that the task can
publish release notes on your own repository.
You can generate an access token on the GitLab page [User Settings > Access Tokens](https://gitlab.com/-/profile/personal_access_tokens), with the `api` scope.
The token must then be set as a CI Variable on your Gitlab project:
1. go to your project settings,
1. go to section **CI / CD**
1. click on `Expand` in the **Variables** section
1. add a new variable named `DEVOPS_GITLAB_TOKEN` whose value is your token
[black]: https://github.com/psf/black
[gitflow]: https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow
[pre-commit]: https://pre-commit.com/
[pytest]: https://docs.pytest.org/
[tox]: https://tox.readthedocs.io/
<svg id="mermaid-1611246541133" width="100%" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" height="252" style="max-width: 152.25001525878906px;" viewBox="0 0 152.25001525878906 252"><style>#mermaid-1611246541133{font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:16px;fill:#000000;}#mermaid-1611246541133 .error-icon{fill:#552222;}#mermaid-1611246541133 .error-text{fill:#552222;stroke:#552222;}#mermaid-1611246541133 .edge-thickness-normal{stroke-width:2px;}#mermaid-1611246541133 .edge-thickness-thick{stroke-width:3.5px;}#mermaid-1611246541133 .edge-pattern-solid{stroke-dasharray:0;}#mermaid-1611246541133 .edge-pattern-dashed{stroke-dasharray:3;}#mermaid-1611246541133 .edge-pattern-dotted{stroke-dasharray:2;}#mermaid-1611246541133 .marker{fill:#666;}#mermaid-1611246541133 .marker.cross{stroke:#666;}#mermaid-1611246541133 svg{font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:16px;}#mermaid-1611246541133 .label{font-family:"trebuchet ms",verdana,arial,sans-serif;color:#000000;}#mermaid-1611246541133 .label text{fill:#000000;}#mermaid-1611246541133 .node rect,#mermaid-1611246541133 .node circle,#mermaid-1611246541133 .node ellipse,#mermaid-1611246541133 .node polygon,#mermaid-1611246541133 .node path{fill:#eee;stroke:#999;stroke-width:1px;}#mermaid-1611246541133 .node .label{text-align:center;}#mermaid-1611246541133 .node.clickable{cursor:pointer;}#mermaid-1611246541133 .arrowheadPath{fill:#333333;}#mermaid-1611246541133 .edgePath .path{stroke:#666;stroke-width:1.5px;}#mermaid-1611246541133 .flowchart-link{stroke:#666;fill:none;}#mermaid-1611246541133 .edgeLabel{background-color:white;text-align:center;}#mermaid-1611246541133 .edgeLabel rect{opacity:0.5;background-color:white;fill:white;}#mermaid-1611246541133 .cluster rect{fill:hsl(210,66.6666666667%,95%);stroke:#26a;stroke-width:1px;}#mermaid-1611246541133 .cluster text{fill:#333;}#mermaid-1611246541133 
div.mermaidTooltip{position:absolute;text-align:center;max-width:200px;padding:2px;font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:12px;background:hsl(-160,0%,93.3333333333%);border:1px solid #26a;border-radius:2px;pointer-events:none;z-index:100;}#mermaid-1611246541133:root{--mermaid-font-family:"trebuchet ms",verdana,arial,sans-serif;}#mermaid-1611246541133 flowchart{fill:apa;}</style><g><g class="output"><g class="clusters"></g><g class="edgePaths"><g class="edgePath LS-test LE-docker" style="opacity: 1;" id="L-test-docker"><path class="path" d="M30.900001525878906,47L30.900001525878906,72L57.058711534135796,97" marker-end="url(#arrowhead283)" style="fill:none"></path><defs><marker id="arrowhead283" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1px; stroke-dasharray: 1px, 0px;"></path></marker></defs></g><g class="edgePath LS-lint LE-docker" style="opacity: 1;" id="L-lint-docker"><path class="path" d="M124.02500915527344,47L124.02500915527344,72L97.86629914701655,97" marker-end="url(#arrowhead284)" style="fill:none"></path><defs><marker id="arrowhead284" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1px; stroke-dasharray: 1px, 0px;"></path></marker></defs></g><g class="edgePath LS-docker LE-release-notes" style="opacity: 1;" id="L-docker-release-notes"><path class="path" d="M77.46250534057617,136L77.46250534057617,170.5L77.46250534057617,205" marker-end="url(#arrowhead285)" style="fill:none"></path><defs><marker id="arrowhead285" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" markerHeight="6" orient="auto"><path d="M 0 0 L 10 5 L 0 10 z" class="arrowheadPath" style="stroke-width: 1px; stroke-dasharray: 1px, 
0px;"></path></marker></defs></g></g><g class="edgeLabels"><g class="edgeLabel" style="opacity: 1;" transform=""><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-test-docker" class="edgeLabel L-LS-test' L-LE-docker"></span></div></foreignObject></g></g><g class="edgeLabel" style="opacity: 1;" transform=""><g transform="translate(0,0)" class="label"><rect rx="0" ry="0" width="0" height="0"></rect><foreignObject width="0" height="0"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-lint-docker" class="edgeLabel L-LS-lint' L-LE-docker"></span></div></foreignObject></g></g><g class="edgeLabel" style="opacity: 1;" transform="translate(77.46250534057617,170.5)"><g transform="translate(-22.25,-9.5)" class="label"><rect rx="0" ry="0" width="44.5" height="19"></rect><foreignObject width="44.5" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;"><span id="L-L-docker-release-notes" class="edgeLabel L-LS-docker' L-LE-release-notes">on tag</span></div></foreignObject></g></g></g><g class="nodes"><g class="node default" style="opacity: 1;" id="flowchart-test-254" transform="translate(30.900001525878906,27.5)"><rect rx="0" ry="0" x="-22.900001525878906" y="-19.5" width="45.80000305175781" height="39" class="label-container"></rect><g class="label" transform="translate(0,0)"><g transform="translate(-12.900001525878906,-9.5)"><foreignObject width="25.800003051757812" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">test</div></foreignObject></g></g></g><g class="node default" style="opacity: 1;" id="flowchart-docker-255" transform="translate(77.46250534057617,116.5)"><rect rx="0" ry="0" x="-34.01667022705078" y="-19.5" 
width="68.03334045410156" height="39" class="label-container"></rect><g class="label" transform="translate(0,0)"><g transform="translate(-24.01667022705078,-9.5)"><foreignObject width="48.03334045410156" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">docker</div></foreignObject></g></g></g><g class="node default" style="opacity: 1;" id="flowchart-lint-256" transform="translate(124.02500915527344,27.5)"><rect rx="0" ry="0" x="-20.225006103515625" y="-19.5" width="40.45001220703125" height="39" class="label-container"></rect><g class="label" transform="translate(0,0)"><g transform="translate(-10.225006103515625,-9.5)"><foreignObject width="20.45001220703125" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">lint</div></foreignObject></g></g></g><g class="node default" style="opacity: 1;" id="flowchart-release-notes-259" transform="translate(77.46250534057617,224.5)"><rect rx="0" ry="0" x="-58.48333740234375" y="-19.5" width="116.9666748046875" height="39" class="label-container"></rect><g class="label" transform="translate(0,0)"><g transform="translate(-48.48333740234375,-9.5)"><foreignObject width="96.9666748046875" height="19"><div xmlns="http://www.w3.org/1999/xhtml" style="display: inline-block; white-space: nowrap;">release-notes</div></foreignObject></g></g></g></g></g></g></svg>
\ No newline at end of file
# Setting up a new worker
This page will guide you through creating a new Arkindex worker locally and
preparing a development environment.
This guide assumes you are using Ubuntu 20.04 or later and have root access.
## Preparing your environment
This section will guide you through preparing your system to create a new
Arkindex worker from our [official template][base-worker].
### Installing system dependencies
To retrieve the Arkindex worker template, you will need to have both Git and
SSH. Git is a version control system that you will later use to manage multiple
versions of your worker. SSH allows secure connections to remote machines, and
will be used in our case to retrieve the template from a Git server.
#### To install system dependencies
1. Run the following command:
```
sudo apt install git ssh
```
### Checking your version of Python
Our Arkindex worker template requires Python 3.6 or later. Checking if a
compatible version of Python is installed avoids further issues in the setup
process.
#### To check your version of Python
1. Run the following command: `python3 --version`
This command will have an output similar to the following:
```
Python 3.6.9
```
### Installing Python
If you were unable to check your Python version as stated above because
`python3` was not found, you will need to install Python 3 on your system.
#### To install Python on Ubuntu
1. Run the following command:
```
sudo apt install python3 python3-pip python3-virtualenv
```
1. Check your Python version again, as instructed in the previous section.
### Installing Python dependencies
To bootstrap a new Arkindex worker, some Python dependencies will be required:
- [pre-commit] will be used to automatically check the
syntax of your source code.
- [tox] will be used to run unit tests.
<!--
TODO: Link to [unit tests](tests)
-->
- [cookiecutter] will be used to bootstrap the project.
- [virtualenvwrapper] will be used to manage Python virtual
environments.
#### To install Python dependencies
1. Run the following command:
```
pip3 install pre-commit tox cookiecutter virtualenvwrapper
```
1. Follow the
[official virtualenvwrapper setup instructions][virtualenvwrapper-setup]
until you are able to run `workon`.
`workon` should have an empty output, as no Python virtual environments have
been set up yet.
## Creating the project
This section will guide you through creating a new worker from our official
template and making it available on a GitLab instance.
### Creating a GitLab project
For a worker to be accessible from an Arkindex instance, it needs to be sent
to a repository on a GitLab project. A GitLab project will also allow you to
manage different versions of a worker and run
[automated checks](ci/index) on your code.
#### To create a GitLab project
1. Open the **New project** form [on GitLab.com](https://gitlab.com/projects/new)
or on another GitLab instance
1. Enter your worker name as the **Project name**
1. Define a **Project slug** related to your worker, e.g.:
- `tesseract` for a Tesseract worker
- `opencv-foo` for an OpenCV worker related to project Foo
1. Click on the **Create project** button
### Bootstrapping the project
This section guides you through using our [official template][base-worker]
to get a basic structure for your worker.
#### To bootstrap the project
1. Open a terminal and go to a folder in which you will want your worker to be.
1. Enter this command and fill in the required information:
```
cookiecutter git@gitlab.com:teklia/workers/base-worker.git
```
Cookiecutter will ask you for several options:
`slug`
: A slug for the worker. This should use lowercase alphanumeric characters or
underscores to meet the code formatting requirements that the template
automatically enforces via [black].
`name`
: A name for the worker, purely used for display purposes.
`description`
: A general description of the worker. This will be used to initialize the `README.md` of your repository as well as the `help` command output.
`worker_type`
: An arbitrary string purely used for display purposes.
For example:
- `recognizer`,
- `classifier`,
- `dla`,
- `entity-recognizer`, etc.
`author`
: A name for the worker's author. Usually your first and last name.
`email`
: Your e-mail address. This will be used to contact you if any administrative need arises.
### Pushing to GitLab
This section guides you through pushing the newly created worker from your
system to the GitLab project's repository.
This section assumes you have Maintainer or Owner access to the GitLab project.
#### To push to GitLab
1. Enter the newly created directory, starting in `worker-` and ending with your
worker's slug.
1. Add your GitLab project as a Git remote:
```
git remote add origin git@my-gitlab-instance.com:path/to/worker.git
```
You will need to use your own instance's URL and the path to your own
project. For example, a project named `hello` in the `teklia` group
on `gitlab.com` will use the following command:
```
git remote add origin git@gitlab.com:teklia/hello.git
```
1. Push the new branch to GitLab:
```
git push --set-upstream origin master
```
If you want to push a different branch, you first need to create it. For example,
if you want to push to a new branch named `bootstrap`, you will use:
```
git checkout -b bootstrap
git push --set-upstream origin bootstrap
```
1. Open your GitLab project in a browser.
1. Click on the blue icon indicating that [CI](ci/index)
is running on your repository, and wait for it to turn green to confirm
everything worked.
## Setting up your development environment
This section guides you through setting up a Python development environment
specifically for your worker.
### Activating the pre-commit hook
The official template includes code syntax checks such as trailing whitespace,
as well as code linting using [black]. Those checks run on GitLab as soon
as you push new code, but it is possible to run those automatically when you
create new commits using the [pre-commit] hook.
#### To activate the pre-commit hook
1. Run `pre-commit install`.
### Setting up the Python virtual environment
To install Python dependencies that are specific to your worker, and prevent
other dependencies installed on your system from interfering, it is recommended
to use a virtual environment.
#### To set up a Python virtual environment
1. Run `mkvirtualenv my_worker`, where `my_worker` is any name of your choice.
1. Install your worker in editable mode: `pip install -e .`
[base-worker]: https://gitlab.com/teklia/workers/base-worker/
[black]: https://github.com/psf/black
[cookiecutter]: https://cookiecutter.readthedocs.io/
[pre-commit]: https://pre-commit.com/
[tox]: https://tox.readthedocs.io/
[virtualenvwrapper]: https://virtualenvwrapper.readthedocs.io
[virtualenvwrapper-setup]: https://virtualenvwrapper.readthedocs.io/en/latest/install.html
# Workers
Arkindex has a powerful system to run asynchronous tasks. Those are based on
Docker images, and can do just about anything: ML processing, but also importing
data into Arkindex, exporting from Arkindex to another system or file format, etc.
This section consists of the following guides:
## Contents
* [Setting up a new worker](create)
* [Running your worker locally](run-local)
* [Maintaining a worker](maintenance)
* [GitLab CI for workers](ci/index)
* [YAML configuration](yaml)
* [Template structure](template-structure)
# Maintaining a worker
This page guides you through common tasks applied while maintaining an Arkindex
worker.
## Updating the template
To get the changes we make on our [official template][base-worker] to apply to
your worker, you will need to re-apply the template to the worker and resolve
any conflicts that may arise.
### To update the template
1. Run the following command:
```
cookiecutter base-worker -f --config-file YOURFILE.yaml --no-input
```
Where `YOURFILE.yaml` is the path of the YAML file you previously created.
1. Answer `yes` when Cookiecutter requests confirmation to delete and
re-download the template.
1. Using the Git diff, resolve the conflicts yourself as Cookiecutter will be
overwriting existing files.
[base-worker]: https://gitlab.com/teklia/workers/base-worker/
# Running your worker locally
Once you have implemented a worker, you can run it on some Arkindex elements
on your own machine to test it.
!!! warning
This section has been deprecated as of the latest version of base-worker.
## Retrieving credentials
For a worker to run properly, you will need two types of credentials:
- An API token that gives the worker access to the API
- A worker version ID that lets the worker send results to Arkindex and report
that those come from this particular worker version
### Retrieving a token
For the worker to run, you will need an Arkindex authentication token.
You can use your own account's token when testing on your own machine.
You can retrieve your personal API Token from your [profile page](https://doc.arkindex.org/users/auth/index.md#personal-token).
### Retrieving a worker version ID
A worker version ID will be required in order to publish results. If your worker
does not create any Arkindex element, classification, transcription, etc., you
may skip this step.
If this particular worker was already configured on this instance, you can use
its existing worker version ID; otherwise, you will need to ask an Arkindex
administrator to create a fake version ID.
#### To retrieve a worker version ID from an existing worker
1. Open a web browser and browse to the Arkindex instance.
2. In the top-right user menu, click on **My repositories**.
3. Click on your worker, listed in the **Workers** column.
4. Rewrite the URL in your browser's address bar, to look like
`https://<arkindex_url>/api/v1/workers/<worker_id>/versions/`
   - Replace `process` with `api/v1`
- Add a slash character (`/`) at the end
In the JSON output from this API endpoint, the first value next to `"id"` is
the worker version ID.
#### To create a fake worker as an administrator
This action can only be done as an Arkindex administrator with shell access.
1. In the backend's Docker image, run:
```
arkindex fake_worker_version --name <NAME> --slug <SLUG> --url <URL>
```
Replace `<NAME>`, `<SLUG>` and `<URL>` with the name, slug and GitLab
repository URL, respectively.
A Git repository is created with a fake OAuth access token. A fake Git revision
is added to this repository, and a fake worker version from a fake worker is
linked to this revision. You should get the following output:
```
Created a worker version: 392bd299-bc8f-4ec6-aa3c-e6503ecc7730
```
!!! warning
This feature should only be used when a normal worker cannot be created using the Git workflow.
## Setting credentials
In a shell you need to set 3 environment variables to transmit your credentials
and Arkindex instance information to the worker:
`ARKINDEX_API_URL`
: URL that points to the root of the Arkindex instance you are using.
`ARKINDEX_API_TOKEN`
: The API token you retrieved earlier, on your profile page.
`WORKER_VERSION_ID`
: The worker version ID you retrieved earlier. Can be omitted if the worker does
not create new data in Arkindex.
### To set credentials for your worker
1. In a shell, run:
```sh
export ARKINDEX_API_URL="https://arkindex.teklia.com"
export ARKINDEX_API_TOKEN="YOUR_TOKEN_HERE"
export WORKER_VERSION_ID="xxxxx"
```
!!! warning
    Do not add these instructions to a script such as `.bashrc`;
    this would mean storing credentials in plaintext and can lead to security breaches.
## Running your worker
With the credentials configured, you can now run your worker.
You will need a list of element IDs to run your worker on, which can be found
in the browser's address bar when browsing an element on Arkindex.
### To run your worker
1. Activate the Python environment: run `workon X` where `X` is the name of
your Python environment.
2. Run `worker-X`, where `X` is the slug of your worker, followed by
`--element=Y` where `Y` is the ID of an element. You can repeat `--element`
as many times as you need to process multiple elements.
# Template structure
When building a new worker from our [official template][base-worker], a file
structure gets created for you to ease the burden of setting up a Python
package, a Docker build, with the best development practices:
`.arkindex.yml`
: YAML configuration file that allows Arkindex to understand what it should do
with this repository.
To learn more about this file, see [YAML configuration](yaml.md).
`.cookiecutter.yaml`
: YAML file that stores the options you defined when creating a new worker.
This file can be reused to [fetch template updates][template-updates].
`.dockerignore`
: Lists which files to exclude from the Docker build context.
For more information, see the [Docker documentation][dockerignore].
`.flake8`
: Specifies configuration options for the Flake8 linter.
For more information, see the [Flake8 documentation][flake8].
`.gitignore`
: Lists which files to exclude from Git versioning.
For more information, see the [Git docs][gitignore].
`.gitlab-ci.yml`
: Configures the GitLab CI jobs and pipelines.
To learn more about the configuration we provide, see
[GitLab Continuous Integration for workers](ci/index).
`.isort.cfg`
: Configures the automatic Python import sorting rules.
For more information, see the [isort docs][isort].
`.pre-commit.config.yaml`
: Configures the [pre-commit hook](create#activating-the-pre-commit-hook).
`Dockerfile`
: Specifies how the Docker image will be built.
You can change the instructions in this file to update the image to the needs
of your worker, for example to install system dependencies.
`requirements.txt`
: Lists the Python dependencies your worker relies on. Those are automatically
installed by the default Dockerfile.
`tox.ini`
: Configures the Python unit test runner.
For more information, see the [tox docs][tox].
`setup.py`
: Configures the worker's Python package.
`VERSION`
: Official version number of your worker. Defaults to `0.1.0`.
`ci/build.sh`
: Script that gets run by [CI](ci/index) pipelines
to build the Docker image.
`tests/test_worker.py`
: An example unit test file.
<!--
TODO: For more information, see [Writing tests for your worker](tests).
-->
`worker_[slug]/__init__.py`
: Declares the folder as a Python package.
`worker_[slug]/worker.py`
: The core part of the worker. This is where you can write code that processes
Arkindex elements.
<!-- TODO:
For more information, see
[Implementing a Machine Learning worker](implement.md).
-->
[base-worker]: https://gitlab.com/teklia/workers/base-worker/
[dockerignore]: https://docs.docker.com/engine/reference/builder/#dockerignore-file
[flake8]: https://flake8.pycqa.org/en/latest/user/configuration.html
[gitignore]: https://git-scm.com/docs/gitignore
[isort]: https://pycqa.github.io/isort/docs/configuration/config_files/
[template-updates]: maintenance#updating-the-template
[tox]: https://tox.readthedocs.io/en/latest/config.html
docs/contents/workers/user_configuration/bool_config.png

28 KiB