Compare revisions

Target project: workers/base-worker
Commits on Source (7)
Showing with 81 additions and 64 deletions
@@ -55,8 +55,7 @@ test:
     - tox -- --junitxml=test-report.xml --durations=50
 
 test-cookiecutter:
-  # Needed till next release
-  image: python:3.11
+  image: python:slim
   stage: test
   cache:
@@ -68,6 +67,7 @@ test-cookiecutter:
     PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
     PRE_COMMIT_HOME: "$CI_PROJECT_DIR/.cache/pre-commit"
     ARKINDEX_API_SCHEMA_URL: schema.yml
+    DEBIAN_FRONTEND: non-interactive
   except:
     - schedules
@@ -75,6 +75,9 @@ test-cookiecutter:
   before_script:
     - pip install cookiecutter tox pre-commit
+    # Install curl and git
+    - apt-get update -q -y && apt-get install -q -y --no-install-recommends curl git
     # Configure git to be able to commit in the hook
     - git config --global user.email "crasher@teklia.com"
     - git config --global user.name "Crash Test"
...
@@ -6,8 +6,8 @@ repos:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
         exclude: "^worker-{{cookiecutter.slug}}/"
-  - repo: https://github.com/ambv/black
-    rev: 23.1.0
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.11.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/pre-commit-hooks
...
-0.3.5
+0.3.6-rc1
@@ -268,6 +268,13 @@ class Dataset(ArkindexModel):
     Describes an Arkindex dataset.
     """
 
+    @property
+    def filepath(self) -> str:
+        """
+        Generic filepath to the Dataset compressed archive.
+        """
+        return f"{self.id}.tar.zst"
+
 
 class Artifact(ArkindexModel):
     """
...
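The `filepath` property added in the hunk above can be exercised with a minimal stand-in (the `FakeDataset` class below is a hypothetical sketch for illustration, not the real `ArkindexModel` subclass):

```python
# Minimal sketch of the new behaviour: a dataset's archive name is
# derived from its ID, with a .tar.zst extension replacing the old .zstd.
# FakeDataset is a hypothetical stand-in for the real Dataset model.
class FakeDataset:
    def __init__(self, id: str):
        self.id = id

    @property
    def filepath(self) -> str:
        """Generic filepath to the Dataset compressed archive."""
        return f"{self.id}.tar.zst"


dataset = FakeDataset("0e59ca45")
print(dataset.filepath)  # 0e59ca45.tar.zst
```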
@@ -179,8 +179,8 @@ def create_tar_zst_archive(
     # Create tar archive
     tar_fd, tar_archive, tar_hash = create_tar_archive(source)
-    zstd_fd, zstd_archive, zstd_hash = zstd_compress(tar_archive, destination)
+    zst_fd, zst_archive, zst_hash = zstd_compress(tar_archive, destination)
     close_delete_file(tar_fd, tar_archive)
 
-    return zstd_fd, zstd_archive, zstd_hash, tar_hash
+    return zst_fd, zst_archive, zst_hash, tar_hash
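A rough, standard-library-only sketch of the tar half of this pipeline (the real helpers live in `arkindex_worker.utils`; the Zstandard compression step is omitted here, and the function bodies are assumptions, only the names mirror the diff):

```python
import hashlib
import os
import tarfile
import tempfile
from pathlib import Path


def create_tar_archive(source: Path):
    """Archive `source` into a temporary tar file; return (fd, path, sha256 digest)."""
    fd, name = tempfile.mkstemp(suffix=".tar")
    path = Path(name)
    with tarfile.open(path, mode="w") as tar:
        tar.add(source, arcname=source.name)
    tar_hash = hashlib.sha256(path.read_bytes()).hexdigest()
    return fd, path, tar_hash


def close_delete_file(fd: int, path: Path) -> None:
    """Release the file descriptor and delete the file, as in the diff."""
    os.close(fd)
    path.unlink()


# Usage: archive a scratch directory, then clean up.
source = Path(tempfile.mkdtemp())
(source / "data.txt").write_text("hello")
tar_fd, tar_archive, tar_hash = create_tar_archive(source)
print(len(tar_hash))  # 64 hex characters for SHA-256
close_delete_file(tar_fd, tar_archive)
```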
@@ -306,7 +306,7 @@ class ElementsWorker(
 class MissingDatasetArchive(Exception):
     """
-    Exception raised when the compressed `.zstd` archive associated to
+    Exception raised when the compressed archive associated to
     a dataset isn't found in its task artifacts.
     """
@@ -366,14 +366,13 @@ class DatasetWorker(BaseWorker, DatasetMixin, TaskMixin):
         """
         task_id = uuid.UUID(dataset.task_id)
-        archive_name = f"{dataset.id}.zstd"
 
         for artifact in self.list_artifacts(task_id):
-            if artifact.path != archive_name:
+            if artifact.path != dataset.filepath:
                 continue
 
             extra_dir = self.find_extras_directory()
-            archive = extra_dir / archive_name
+            archive = extra_dir / dataset.filepath
             archive.write_bytes(self.download_artifact(task_id, artifact).read())
             return archive
...
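The artifact lookup rewritten above can be illustrated with hypothetical stand-ins for the Arkindex models (`FakeArtifact` and `FakeDataset` are invented for this sketch):

```python
# Illustrative sketch: scan a task's artifacts for the dataset archive
# by its generic filepath, as the DatasetWorker does after this change.
from dataclasses import dataclass


@dataclass
class FakeArtifact:
    path: str


class FakeDataset:
    def __init__(self, id: str):
        self.id = id

    @property
    def filepath(self) -> str:
        return f"{self.id}.tar.zst"


def find_archive(dataset, artifacts):
    """Return the artifact matching the dataset archive name, or None."""
    for artifact in artifacts:
        if artifact.path == dataset.filepath:
            return artifact
    return None


dataset = FakeDataset("0e59")
artifacts = [FakeArtifact("logs.txt"), FakeArtifact("0e59.tar.zst")]
print(find_archive(dataset, artifacts).path)  # 0e59.tar.zst
```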
@@ -217,6 +217,9 @@ class BaseWorker(object):
         # Define model_version_id from environment
         self.model_version_id = os.environ.get("ARKINDEX_MODEL_VERSION_ID")
 
+        # Define model_details from environment
+        self.model_details = {"id": os.environ.get("ARKINDEX_MODEL_ID")}
+
         # Load all required secrets
         self.secrets = {name: self.load_secret(Path(name)) for name in required_secrets}
@@ -259,6 +262,9 @@ class BaseWorker(object):
         # Set model_version ID as worker attribute
         self.model_version_id = model_version.get("id")
 
+        # Set model details as worker attribute
+        self.model_details = model_version.get("model")
+
         # Retrieve initial configuration from API
         self.config = worker_version["configuration"].get("configuration", {})
         if "user_configuration" in worker_version["configuration"]:
...
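The two hunks above set `model_details` in two stages: an environment-based default, overridden by the model version payload from the API when one is available. A hedged sketch of that resolution order (`resolve_model_details` is an illustrative helper, not part of the library):

```python
# The environment provides a fallback {"id": ...} dict; the API payload
# (when present) replaces it entirely with the nested "model" object.
import os


def resolve_model_details(model_version=None):
    details = {"id": os.environ.get("ARKINDEX_MODEL_ID")}
    if model_version is not None:
        details = model_version.get("model")
    return details


os.environ["ARKINDEX_MODEL_ID"] = "1234"
print(resolve_model_details())  # {'id': '1234'}
print(resolve_model_details({"model": {"id": "abcd", "name": "demo"}}))
```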
@@ -32,7 +32,6 @@ class ElementMixin(object):
     def create_required_types(self, element_types: List[ElementType]):
         """Creates given element types in the corpus.
 
-        :param Corpus corpus: The corpus to create types on.
         :param element_types: The missing element types to create.
         """
         for element_type in element_types:
...
@@ -336,7 +336,7 @@ class EntityMixin(object):
         List all entities in the worker's corpus
         This method does not support cache
         :param name: Filter entities by part of their name (case-insensitive)
-        :param parent Element: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
+        :param parent: Restrict entities to those linked to all transcriptions of an element and all its descendants. Note that links to metadata are ignored.
         """
         query_params = {}
...
@@ -121,8 +121,8 @@ class MetaDataMixin(object):
         Create multiple metadatas on an existing element.
         This method does not support cache.
-        :param element Element: The element to create multiple metadata on.
-        :param metadata_list List(Dict): The list of dict whose keys are the following:
+        :param element: The element to create multiple metadata on.
+        :param metadatas: The list of dict whose keys are the following:
         - type : MetaType
         - name : str
         - value : Union[str, Union[int, float]]
...
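The renamed `metadatas` argument documented above is a list of dicts carrying `type`, `name`, and `value` keys. A quick illustrative payload (the concrete values are invented):

```python
# Sketch of the documented dict shape: type / name / value per entry.
metadatas = [
    {"type": "text", "name": "folio", "value": "12r"},
    {"type": "numeric", "name": "confidence", "value": 0.87},
]

# Each entry must carry the three documented keys.
assert all({"type", "name", "value"} <= set(entry) for entry in metadatas)
print(len(metadatas))  # 2
```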
@@ -37,15 +37,15 @@ def create_archive(path: DirPath) -> Tuple[Path, Hash, FileSize, Hash]:
     """
     assert path.is_dir(), "create_archive needs a directory"
 
-    zstd_descriptor, zstd_archive, archive_hash, content_hash = create_tar_zst_archive(
+    zst_descriptor, zst_archive, archive_hash, content_hash = create_tar_zst_archive(
         path
     )
 
     # Get content hash, archive size and hash
-    yield zstd_archive, content_hash, zstd_archive.stat().st_size, archive_hash
+    yield zst_archive, content_hash, zst_archive.stat().st_size, archive_hash
 
-    # Remove the zstd archive
-    close_delete_file(zstd_descriptor, zstd_archive)
+    # Remove the zst archive
+    close_delete_file(zst_descriptor, zst_archive)
 
 
 def build_clean_payload(**kwargs):
...
@@ -77,9 +77,9 @@ This is also when the secrets (see [this section](../secrets/usage.md#declaring-
 An Arkindex-mode exclusive step is done after all that: the cache setup. Some workers benefit a lot, performance-wise, from having a SQLite cache artifact from previous workers. This is mostly used in processes with multiple workers with dependencies, where the second worker needs the results of the first one to work. The database is initialized, the tables are created, and its version is checked, as it must match the one supported by the Arkindex instance. The database is then merged with any other database generated by previous worker runs.
 
 ## Developer mode
 
-In the developer mode, the worker execution is not linked to anything on Arkindex. Therefore, the only configuration the worker can use is provided via the `--config` CLI argument. It supports a YAML-formatted file and it should be similar to the `configuration` section of the [worker configuration file](../workers/yaml/#single-worker-configuration), without the `user_configuration` details. More details about how to create the local worker configuration are available in [this section](../workers/run-local/).
+In the developer mode, the worker execution is not linked to anything on Arkindex. Therefore, the only configuration the worker can use is provided via the `--config` CLI argument. It supports a YAML-formatted file and it should be similar to the `configuration` section of the [worker configuration file](../workers/yaml.md#single-worker-configuration), without the `user_configuration` details. More details about how to create the local worker configuration are available in [this section](../workers/run-local.md).
 
-The multiple configuration sources from the Arkindex mode are merged into a unique one here. The configuration parameters are parsed, as well as the list of required secrets. The secrets are loaded using a local Arkindex client. Again, see the [section about local execution](../workers/run-local/) for more details.
+The multiple configuration sources from the Arkindex mode are merged into a unique one here. The configuration parameters are parsed, as well as the list of required secrets. The secrets are loaded using a local Arkindex client. Again, see the [section about local execution](../workers/run-local.md) for more details.
 
 One piece of information cannot be retrieved directly from the configuration file and is required in some cases: the ID of the Arkindex corpus to which the processed elements belong. This is retrieved via the `ARKINDEX_CORPUS_ID` environment variable.
@@ -115,6 +115,9 @@ Many attributes are set on the worker at the configuration stage. Here is
 `model_version_id`
 : The ID of the model version linked to the current `WorkerRun` object on Arkindex. You may set it in developer mode via the `ARKINDEX_MODEL_VERSION_ID` environment variable.
 
+`model_details`
+: The details of the model for the model version linked to the current `WorkerRun` object on Arkindex. You may populate it in developer mode via the `ARKINDEX_MODEL_ID` environment variable.
+
 `process_information`
 : The details about the process parent to this worker execution. Only set in Arkindex mode.
...
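The cache-setup step described in the hunk above (initialize the SQLite database, then merge databases from previous worker runs) can be sketched with the standard library. Table and column names below are invented for illustration; the real schema lives in the base-worker cache module:

```python
# Hedged sketch of the cache merge: attach a previous run's SQLite
# database and copy its rows into the current run's cache.
import sqlite3
import tempfile
from pathlib import Path

workdir = Path(tempfile.mkdtemp())
previous = workdir / "previous.sqlite"
current = workdir / "current.sqlite"

# A previous worker run produced some cached elements.
with sqlite3.connect(previous) as db:
    db.execute("CREATE TABLE elements (id TEXT PRIMARY KEY)")
    db.executemany("INSERT INTO elements VALUES (?)", [("a",), ("b",)])

# The current run initializes its own cache, then merges the older database.
with sqlite3.connect(current) as db:
    db.execute("CREATE TABLE elements (id TEXT PRIMARY KEY)")
    db.execute("ATTACH DATABASE ? AS previous_run", (str(previous),))
    db.execute("INSERT OR IGNORE INTO elements SELECT id FROM previous_run.elements")
    merged = db.execute("SELECT COUNT(*) FROM elements").fetchone()[0]

print(merged)  # 2
```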
@@ -4,7 +4,7 @@ This page describes how continuous integration (CI) is used in workers created
 using the `base-worker` template.
 
 For more information on creating workers, see
-[Setting up a worker](../create).
+[Setting up a worker](../create.md).
 
 ## Default template
@@ -41,7 +41,7 @@ project and validate various rules:
 - Fix some common spelling errors
 
 You can set up pre-commit to run locally too; see
-[Activating the pre-commit hook](../create#activating-the-pre-commit-hook).
+[Activating the pre-commit hook](../create.md#activating-the-pre-commit-hook).
 
 ## Testing
@@ -82,9 +82,9 @@ You can generate an access token on the Gitlab's page [User Settings > Access To
 The token must then be set as a CI Variable on your Gitlab project:
 
 1. go to your project settings,
-1. go to section **CI / CD**
-1. click on `Expand` in the **Variables** section
-1. add a new variable named `DEVOPS_GITLAB_TOKEN` whose value is your token
+2. go to section **CI / CD**
+3. click on `Expand` in the **Variables** section
+4. add a new variable named `DEVOPS_GITLAB_TOKEN` whose value is your token
 
 [black]: https://github.com/psf/black
 [gitflow]: https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow
...
@@ -54,7 +54,7 @@ If you were unable to check your Python version as stated above because
    ```
    sudo apt install python3 python3-pip python3-virtualenv
    ```
 
-1. Check your Python version again, as instructed in the previous section.
+2. Check your Python version again, as instructed in the previous section.
 
 ### Installing Python dependencies
@@ -80,7 +80,7 @@ TODO: Link to [unit tests](tests)
    ```
    pip3 install pre-commit tox cookiecutter virtualenvwrapper
    ```
 
-1. Follow the
+2. Follow the
    [official virtualenvwrapper setup instructions][virtualenvwrapper-setup]
    until you are able to run `workon`.
@@ -97,21 +97,21 @@ template and making it available on a GitLab instance.
 For a worker to be accessible from an Arkindex instance, it needs to be sent
 to a repository on a GitLab project. A GitLab project will also allow you to
 manage different versions of a worker and run
-[automated checks](ci/index) on your code.
+[automated checks](ci/index.md) on your code.
 
 #### To create a GitLab project
 
 1. Open the **New project** form [on GitLab.com](https://gitlab.com/projects/new)
    or on another GitLab instance
-1. Enter your worker name as the **Project name**
-1. Define a **Project slug** related to your worker, e.g.:
+2. Enter your worker name as the **Project name**
+3. Define a **Project slug** related to your worker, e.g.:
    - `tesseract` for a Tesseract worker
    - `opencv-foo` for an OpenCV worker related to project Foo
-1. Click on the **Create project** button
+4. Click on the **Create project** button
 
 ### Bootstrapping the project
@@ -122,7 +122,7 @@ to get a basic structure for your worker.
 
 1. Open a terminal and go to a folder in which you will want your worker to be.
-1. Enter this command and fill in the required information:
+2. Enter this command and fill in the required information:
 
    ```
    cookiecutter git@gitlab.teklia.com:workers/base-worker.git
@@ -171,7 +171,7 @@ This section assumes you have Maintainer or Owner access to the GitLab project.
 1. Enter the newly created directory, starting in `worker-` and ending with your
    worker's slug.
-1. Add your GitLab project as a Git remote:
+2. Add your GitLab project as a Git remote:
 
    ```
    git remote add origin git@my-gitlab-instance.com:path/to/worker.git
@@ -185,7 +185,7 @@ This section assumes you have Maintainer or Owner access to the GitLab project.
    git remote add origin git@gitlab.com:teklia/hello.git
    ```
 
-1. Push the new branch to GitLab:
+3. Push the new branch to GitLab:
 
    ```
    git push --set-upstream origin master
@@ -199,9 +199,9 @@ This section assumes you have Maintainer or Owner access to the GitLab project.
    git push --set-upstream origin bootstrap
    ```
 
-1. Open your GitLab project in a browser.
-1. Click on the blue icon indicating that [CI](ci/index)
+4. Open your GitLab project in a browser.
+5. Click on the blue icon indicating that [CI](ci/index.md)
    is running on your repository, and wait for it to turn green to confirm
    everything worked.
@@ -230,7 +230,7 @@ to use a virtual environment.
 #### To set up a Python virtual environment
 
 1. Run `mkvirtualenv my_worker`, where `my_worker` is any name of your choice.
-1. Install your worker in editable mode: `pip install -e .`
+2. Install your worker in editable mode: `pip install -e .`
 
 [base-worker]: https://gitlab.teklia.com/workers/base-worker/
 [black]: https://github.com/psf/black
...
@@ -8,9 +8,9 @@ This section consists of the following guides:
 ## Contents
 
-* [Setting up a new worker](create)
-* [Running your worker locally](run-local)
-* [Maintaining a worker](maintenance)
-* [GitLab CI for workers](ci/index)
-* [YAML configuration](yaml)
-* [Template structure](template-structure)
+* [Setting up a new worker](create.md)
+* [Running your worker locally](run-local.md)
+* [Maintaining a worker](maintenance.md)
+* [GitLab CI for workers](ci/index.md)
+* [YAML configuration](yaml.md)
+* [Template structure](template-structure.md)
@@ -19,10 +19,10 @@ any conflicts that may arise.
 
    Where `YOURFILE.yaml` is the path of the YAML file you previously created.
 
-1. Answer `yes` when Cookiecutter requests confirmation to delete and
+2. Answer `yes` when Cookiecutter requests confirmation to delete and
    re-download the template.
-1. Using the Git diff, resolve the conflicts yourself as Cookiecutter will be
+3. Using the Git diff, resolve the conflicts yourself as Cookiecutter will be
    overwriting existing files.
 
 [base-worker]: https://gitlab.teklia.com/workers/base-worker/
...
@@ -28,14 +28,14 @@ package, a Docker build, with the best development practices:
 `.gitlab-ci.yml`
 : Configures the GitLab CI jobs and pipelines.
   To learn more about the configuration we provide, see
-  [GitLab Continuous Integration for workers](ci/index).
+  [GitLab Continuous Integration for workers](ci/index.md).
 
 `.isort.cfg`
 : Configures the automatic Python import sorting rules.
   For more information, see the [isort docs][isort].
 
 `.pre-commit.config.yaml`
-: Configures the [pre-commit hook](create#activating-the-pre-commit-hook).
+: Configures the [pre-commit hook](create.md#activating-the-pre-commit-hook).
 
 `Dockerfile`
 : Specifies how the Docker image will be built.
@@ -57,7 +57,7 @@ package, a Docker build, with the best development practices:
 : Official version number of your worker. Defaults to `0.1.0`.
 
 `ci/build.sh`
-: Script that gets run by [CI](ci/index) pipelines
+: Script that gets run by [CI](ci/index.md) pipelines
   to build the Docker image.
@@ -85,5 +85,5 @@ For more information, see
 [flake8]: https://flake8.pycqa.org/en/latest/user/configuration.html
 [gitignore]: https://git-scm.com/docs/gitignore
 [isort]: https://pycqa.github.io/isort/docs/configuration/config_files/
-[template-updates]: maintenance#updating-the-template
+[template-updates]: maintenance.md#updating-the-template
 [tox]: https://tox.readthedocs.io/en/latest/config.html
...
-# Elements Worker
+# Arkindex Workers
 
 ::: arkindex_worker.worker
@@ -42,7 +42,7 @@ Released on **14 Sept 2023** • View on [Gitlab](https://gitlab.teklia.com/w
 - Base-worker now uses [ruff](https://github.com/charliermarsh/ruff) for linting. This tool replaces `isort` and `flake8`.
 - New Arkindex API helper to update an element, calling [PartialUpdateElement](https://demo.arkindex.org/api-docs/#tag/elements/operation/PartialUpdateElement).
 - New Arkindex API helper to list an element's parents, calling [ListElementParents](https://demo.arkindex.org/api-docs/#tag/elements/operation/ListElementParents).
-- Worker Activity API is now disabled when the worker runs in `read-only` mode instead of relying on the `--dev` CLI argument. The [update_activity](https://workers.arkindex.org/ref/elements_worker/#arkindex_worker.worker.ElementsWorker.update_activity) API helper was updated following Arkindex 1.5.1 changes.
+- Worker Activity API is now disabled when the worker runs in `read-only` mode instead of relying on the `--dev` CLI argument. The [update_activity](https://workers.arkindex.org/ref/arkindex_workers/#arkindex_worker.worker.ElementsWorker.update_activity) API helper was updated following Arkindex 1.5.1 changes.
 - Workers can now resize the image of an element when opening it. This uses the [IIIF](https://iiif.io/api/image/2.1/#size) resizing API.
@@ -57,14 +57,14 @@ from arkindex_worker.utils import Timer
   # New usage
   from teklia_toolbox.time import Timer
   ```
-- The [create_element_transcriptions](../ref/api/transcription/#arkindex_worker.worker.transcription.TranscriptionMixin.create_element_transcriptions) API helper now accepts an `element_confidence` float field in the dictionaries provided through the `transcriptions` field. This confidence will be set on the created element.
+- The [create_element_transcriptions](ref/api/transcription.md#arkindex_worker.worker.transcription.TranscriptionMixin.create_element_transcriptions) API helper now accepts an `element_confidence` float field in the dictionaries provided through the `transcriptions` field. This confidence will be set on the created element.
-- More query filters are available on the [list_element_children](../ref/api/element/#arkindex_worker.worker.element.ElementMixin.list_element_children) API helper. More details about their usage are available in the documentation:
+- More query filters are available on the [list_element_children](ref/api/element.md#arkindex_worker.worker.element.ElementMixin.list_element_children) API helper. More details about their usage are available in the documentation:
   - `transcription_worker_version`
   - `transcription_worker_run`
   - `with_metadata`
   - `worker_run`
 - `Arkindex Base-Worker` now fully uses [pathlib](https://docs.python.org/3/library/pathlib.html) to handle filesystem paths as suggested by [PEP 428](https://peps.python.org/pep-0428/).
-- Many helpers were added to handle [ZSTD](https://en.wikipedia.org/wiki/Zstd) and [TAR](https://en.wikipedia.org/wiki/Tar_(computing)) archives, as well as delete files cleanly. More details about that in the documentation of the [arkindex_worker.utils](../ref/utils/) module.
+- Many helpers were added to handle [ZSTD](https://en.wikipedia.org/wiki/Zstd) and [TAR](https://en.wikipedia.org/wiki/Tar_(computing)) archives, as well as delete files cleanly. More details about that in the documentation of the [arkindex_worker.utils](ref/utils.md) module.
 - A bug affecting the parsing of the configuration of workers that use a machine learning model stored on an Arkindex instance was fixed.
@@ -78,13 +78,13 @@ Released on **8 March 2023** • View on [Gitlab](https://gitlab.teklia.com/w
 - The model version API endpoints were updated in the [latest Arkindex release](https://teklia.com/solutions/arkindex/releases/1-4-0/) and a new helper was introduced subsequently. However, there are no breaking changes and the main helper, `publish_model_version`, still has the same signature and behaviour.
 - The latest Arkindex release changed the way NER entities are stored and published.
   - The `EntityType` enum was removed, as type slugs are no longer restricted to a small set of options,
-  - [create_entity](../ref/api/entity/#arkindex_worker.worker.entity.EntityMixin.create_entity) now expects a type slug as a String,
+  - [create_entity](ref/api/entity.md#arkindex_worker.worker.entity.EntityMixin.create_entity) now expects a type slug as a String,
-  - a new helper [list_corpus_entity_types](../ref/api/entity/#arkindex_worker.worker.entity.EntityMixin.list_corpus_entity_types) was added to load the Entity types in the corpus,
+  - a new helper [list_corpus_entity_types](ref/api/entity.md#arkindex_worker.worker.entity.EntityMixin.list_corpus_entity_types) was added to load the Entity types in the corpus,
-  - a new helper [check_required_entity_types](../ref/api/entity/#arkindex_worker.worker.entity.EntityMixin.check_required_entity_types) to make sure that needed entity types are available in the corpus was added. Missing ones are created by default (this can be disabled).
+  - a new helper [check_required_entity_types](ref/api/entity.md#arkindex_worker.worker.entity.EntityMixin.check_required_entity_types) to make sure that needed entity types are available in the corpus was added. Missing ones are created by default (this can be disabled).
-- The [create_classifications](../ref/api/classification/#arkindex_worker.worker.classification.ClassificationMixin.create_classifications) helper now expects the UUID of each MLClass instead of their name.
+- The [create_classifications](ref/api/classification.md#arkindex_worker.worker.classification.ClassificationMixin.create_classifications) helper now expects the UUID of each MLClass instead of their name.
 - In developer mode, the only way to set the `corpus_id` attribute is to use the `ARKINDEX_CORPUS_ID` environment variable. When it's not set, all API requests using the `corpus_id` as a path parameter will fail with a `500` status code. A warning log was added to help developers troubleshoot this error by advising them to set this variable.
-- The [create_transcriptions](../ref/api/transcription/#arkindex_worker.worker.transcription.TranscriptionMixin.create_transcriptions) helper no longer makes the API call in developer mode. This behaviour aligns with all other publication helpers.
+- The [create_transcriptions](ref/api/transcription.md#arkindex_worker.worker.transcription.TranscriptionMixin.create_transcriptions) helper no longer makes the API call in developer mode. This behaviour aligns with all other publication helpers.
-- Fixes hash computation when publishing a model using [publish_model_version](../ref/api/training/#arkindex_worker.worker.training.TrainingMixin.publish_model_version).
+- Fixes hash computation when publishing a model using [publish_model_version](ref/api/training.md#arkindex_worker.worker.training.TrainingMixin.publish_model_version).
 - If a process is linked to a model version, its ID will be available to the worker through its `model_version_id` attribute.
 - The URLs of the API endpoints related to Ponos were changed in the latest Arkindex release. Some changes were needed in the test suite.
 - The `classes` attribute now directly contains the classes of the corpus of the processed element.
...
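The `element_confidence` addition to `create_element_transcriptions` noted in the release notes above can be illustrated with a sample payload. Every key other than `element_confidence` is an assumption made for the sake of the example; check the API reference for the exact dictionary shape:

```python
# Illustrative transcriptions payload: each entry may now carry an
# element_confidence float, which is set on the created element.
transcriptions = [
    {
        "polygon": [[0, 0], [100, 0], [100, 50], [0, 50]],
        "text": "hello world",
        "confidence": 0.92,
        "element_confidence": 0.88,
    },
]

# Confidences are floats in the [0, 1] range.
assert all(0.0 <= t["element_confidence"] <= 1.0 for t in transcriptions)
print(len(transcriptions))  # 1
```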
@@ -77,7 +77,7 @@ nav:
       - Configuration: contents/implem/configure.md
   - Python Reference:
       - Base Worker: ref/base_worker.md
-      - Elements Worker: ref/elements_worker.md
+      - Arkindex Workers: ref/arkindex_workers.md
   - Arkindex API integration:
       - ref/api/index.md
       - Classification: ref/api/classification.md
...
......