
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (27)
Showing changes with 1632 additions and 835 deletions
......@@ -2,6 +2,9 @@
.git
.eggs
*.egg
logs
**/__pycache__/
**/*.pyc
docker/
Makefile
test-report.xml
arkindex/config.yml
......@@ -16,3 +16,4 @@ htmlcov
*.key
arkindex/config.yml
test-report.xml
docker/ssl/*.pem
include VERSION
include LICENSE
include requirements.txt
include base/requirements.txt
include tests-requirements.txt
......
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend
.PHONY: all release
.PHONY: all release services
all: clean build
......@@ -20,8 +20,8 @@ worker:
test-fixtures:
$(eval export PGPASSWORD=devdata)
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
# A "try...finally" block in a Makefile: ensure we bring back the dev database even when test-fixtures fails
-$(MAKE) test-fixtures-run
$(MAKE) test-fixtures-restore
......@@ -33,9 +33,9 @@ test-fixtures-run:
test-fixtures-restore:
# This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
require-version:
@if [ ! "$(version)" ]; then echo "Missing version to publish"; exit 1; fi
......@@ -50,3 +50,21 @@ release:
git commit VERSION -m "Version $(version)"
git tag $(version)
git push origin master $(version)
clean-docker:
$(eval containers:=$(shell docker ps -a -q))
@if [ -n "$(containers)" ]; then \
echo "Cleaning up past containers\n" \
docker rm -f $(containers) ; \
fi
stack: docker/ssl/ark-cert.pem
docker compose -p arkindex up --build
services: docker/ssl/ark-cert.pem
docker compose -p arkindex -f docker/docker-compose.services.yml up
docker/ssl/ark-cert.pem:
$(eval export CAROOT=$(ROOT_DIR)/docker/ssl)
mkcert -install
mkcert -cert-file=$(ROOT_DIR)/docker/ssl/ark-cert.pem -key-file=$(ROOT_DIR)/docker/ssl/ark-key.pem ark.localhost *.ark.localhost *.iiif.ark.localhost
Backend for Historical Manuscripts Indexing
===========================================
# Arkindex Backend
[![pipeline status](https://gitlab.teklia.com/arkindex/backend/badges/master/pipeline.svg)](https://gitlab.teklia.com/arkindex/backend/commits/master)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
This project is the open-source backend of Arkindex, used to manage and process image documents with Machine Learning tools.
It is licensed under the [AGPL-v3 license](./LICENSE).
## Requirements
* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture)
* Git
* Make
* Python 3.6+
* Python 3.10+
* pip
* [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/)
* [Docker 24+](https://docs.docker.com/engine/install/#supported-platforms)
* [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation)
* [GeoDjango system dependencies](https://docs.djangoproject.com/en/3.1/ref/contrib/gis/install/geolibs/): `sudo apt install binutils libproj-dev gdal-bin`
## Dev Setup
## Setup for developers
```
You'll also need the [Arkindex frontend](https://gitlab.teklia.com/arkindex/frontend) to be able to develop on the whole platform.
```console
git clone git@gitlab.teklia.com:arkindex/backend.git
git clone git@gitlab.teklia.com:arkindex/frontend.git
cd backend
mkvirtualenv ark -a .
mkvirtualenv ark -a . -p /usr/bin/python3.10
pip install -e .[test]
```
When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services:
The Arkindex backend relies on some open-source services to store data and communicate with asynchronous workers.
To run all the required services, please run in a dedicated shell:
```console
make services
```
On a first run, you'll need to:
1. Configure the instance by enabling the sample configuration.
2. Populate the database structure.
3. Initialize some fields in the database.
4. Create an administration account.
All of these steps are done through:
```console
cp config.yml.sample arkindex/config.yml
arkindex migrate
arkindex bootstrap
arkindex createsuperuser
```
### Local configuration
Finally, you can run the backend:
```console
arkindex runserver
```
At this stage, you can use `http://localhost:8000/admin` to access the administration interface.
### Asynchronous tasks
To run asynchronous tasks, run in another shell:
```console
make worker
```
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://wiki.vpn/en/arkindex/deploy/configuration).
### Dockerized stack
It is also possible to run the whole Arkindex stack through Docker containers. This is useful to quickly test the platform.
Another mean to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
This command will build all the required Docker images (backend & frontend) and run them as Docker containers:
### ImageMagick setup
```console
make stack
```
You'll be able to access the platform at `https://ark.localhost`.
PDF and image imports in Arkindex require ImageMagick. Because ImageMagick can bring any computer down given the right parameters (for example, converting a 1000-page PDF file into JPEG files at 30,000 DPI), it ships with a security policy file. By default on Ubuntu, PDF conversion is forbidden.
### Local configuration
You will need to edit the ImageMagick policy file to get PDF and image imports to work in Arkindex. The file is located at `/etc/ImageMagick-6/policy.xml`.
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://redmine.teklia.com/projects/arkindex/wiki/Backend_configuration).
The line that sets the PDF policy is `<policy domain="coder" rights="none" pattern="PDF" />`. Replace `none` with `read|write` so that it reads `<policy domain="coder" rights="read|write" pattern="PDF" />`. See [this StackOverflow question](https://stackoverflow.com/questions/52998331) for more details.
Another way to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
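
As a rough illustration, a minimal `local_settings.py` might look like the sketch below; the settings shown are ordinary Django settings chosen purely as examples, not a recommended configuration:

```python
# arkindex/project/local_settings.py — illustration only.
# Any Django setting can be overridden here, and arbitrary Python runs at boot time.
from datetime import timedelta

DEBUG = True
INTERNAL_IPS = ["127.0.0.1"]  # e.g. needed by Django Debug Toolbar

# Values can be computed at startup, not just hard-coded.
SESSION_COOKIE_AGE = int(timedelta(days=7).total_seconds())
```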
### Local image server
......@@ -54,7 +97,7 @@ local_imageserver_id: 999
Here is how to quickly create the ImageServer using the shell:
```
```python
$ arkindex shell
>>> from arkindex.images.models import ImageServer
>>> ImageServer.objects.create(id=1, display_name='local', url='https://ark.localhost/iiif')
......@@ -62,11 +105,6 @@ $ arkindex shell
Note that this local server will only work inside Docker.
### User groups
We use a custom group model in `arkindex.users.models` (not the `django.contrib.auth` one).
In this early version groups do not define any right yet.
## Usage
### Makefile
......@@ -76,6 +114,7 @@ At the root of the repository is a Makefile that provides commands for common op
* `make` or `make all`: Clean and build;
* `make base`: Create and push the `arkindex-base` Docker image that is used to build the `arkindex-app` image;
* `make clean`: Clean up the Python package build and cache files;
* `make clean-docker`: Delete all running containers to avoid naming and network port conflicts;
* `make build`: Build the arkindex Python package and recreate the `arkindex-app:latest` without pushing to the GitLab container registry;
* `make test-fixtures`: Create the unit test fixtures on a temporary PostgreSQL database and save them to the `data.json` file used by most Django unit tests.
......@@ -83,14 +122,10 @@ At the root of the repository is a Makefile that provides commands for common op
Aside from the usual Django commands, some custom commands are available via `arkindex`:
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`);
* `from_csv`: Import manifests and index files from a CSV list;
* `import_annotations`: Import index files from a folder into a specific volume;
* `import_acts`: Import XML surface files and CSV act files;
* `delete_corpus`: Delete a big corpus using an RQ task;
* `reindex`: Reindex elements into Solr;
* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics.
* `move_lines_to_parents`: Moves element children to their geographical parents;
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`).
* `delete_corpus`: Delete a big corpus using an RQ task.
* `reindex`: Reindex elements into Solr.
* `move_lines_to_parents`: Moves element children to their geographical parents.
See `arkindex <command> --help` to view more details about a specific command.
......@@ -108,9 +143,9 @@ We use [pre-commit](https://pre-commit.com/) to check the Python source code syn
To be efficient, you should run pre-commit before committing (hence the name...).
To do that, run once :
To do that, run once:
```
```console
pip install pre-commit
pre-commit install
```
......@@ -127,9 +162,9 @@ IPython will give you a nicer shell with syntax highlighting, auto reloading and
[Django Debug Toolbar](https://django-debug-toolbar.readthedocs.io/en/latest/) provides you with a neat debug sidebar that will help diagnosing slow API endpoints or weird template bugs. Since the Arkindex frontend is completely decoupled from the backend, you will need to browse to an API endpoint to see the debug toolbar.
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports most of the backend's enums and some special QuerySet features:
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports some of the backend's enums and some special QuerySet features:
``` python
```python
SHELL_PLUS_POST_IMPORTS = [
('django.db.models', ('Value', )),
('django.db.models.functions', '*'),
......@@ -138,7 +173,7 @@ SHELL_PLUS_POST_IMPORTS = [
'Right',
)),
('arkindex.process.models', (
'DataImportMode',
'ProcessMode',
)),
('arkindex.project.aws', (
'S3FileStatus',
......@@ -148,23 +183,33 @@ SHELL_PLUS_POST_IMPORTS = [
## Asynchronous tasks
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist:
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`, or in other `tasks` modules within each Django app. The following tasks exist:
* Delete a corpus: `corpus_delete`
* Delete a list of elements: `element_trash`
* Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete`
* Move an element to another parent: `move_element`
* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity`
* Create `WorkerActivity` instances for all elements of a process: `initialize_activity`
* Delete a process and its worker activities: `process_delete`
* Export a corpus to an SQLite database: `export_corpus`
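
As a hedged sketch of how a trigger is meant to be called from Python code (the real signatures live in `arkindex/project/triggers.py` and may differ; the queryset filter below is only an example):

```python
# Sketch only: assumes a trigger named like the task exists in
# arkindex.project.triggers and accepts a queryset of elements; check the
# module for the actual signatures before relying on this.
from arkindex.documents.models import Element
from arkindex.project import triggers

pages = Element.objects.filter(corpus__name="Test corpus", type__slug="page")
triggers.element_trash(pages)  # enqueues the deletion task instead of blocking the request
```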
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build.
Process tasks are run in RQ by default (Community Edition). Two RQ workers must be running at the same time to actually run a process with worker activities, so the initialisation task can wait for the worker activity task to finish:
```sh
$ arkindex rqworker -v 3 default high & arkindex rqworker -v 3 tasks
```
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make services` will provide it. `make stack` also provides an RQ worker running in Docker from a binary build.
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
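
As a quick sanity check (assuming the backend is reachable on `localhost:8000` and no separate metrics port is configured), something like the following should print Prometheus-formatted metrics:

```python
# Fetch the Prometheus metrics endpoint; adjust the host and port to your setup.
import urllib.request

with urllib.request.urlopen("http://localhost:8000/metrics") as response:
    body = response.read().decode()

print("\n".join(body.splitlines()[:10]))  # show the first few exposition lines
```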
## Migration from `architecture` setup
If you were previously using the `architecture` repository to run Arkindex, you'll need to migrate MinIO data from a static path on your computer to a new Docker volume.
```console
docker volume create arkindex_miniodata
mv /usr/share/arkindex/s3/data/iiif /var/lib/docker/volumes/arkindex_miniodata/_data/uploads
mv /usr/share/arkindex/s3/data/{export,iiif-cache,ponos-logs,ponos-artifacts,staging,thumbnails,training} /var/lib/docker/volumes/arkindex_miniodata/_data/
```
You will also need to set up [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation), as we no longer use the Teklia development Certificate Authority. `mkcert` takes care of SSL certificates automatically, updating your browsers and system certificate store!
Finally, you can remove the `architecture` project from your work folder, as it's now archived and could be confusing.
1.5.4
1.6.0-beta3
......@@ -67,7 +67,7 @@ from arkindex.documents.serializers.elements import (
ElementNeighborsSerializer,
ElementParentSerializer,
ElementSerializer,
ElementSlimSerializer,
ElementTinySerializer,
ElementTypeSerializer,
MetaDataBulkSerializer,
MetaDataCreateSerializer,
......@@ -1410,7 +1410,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
@extend_schema(
operation_id="AddSelection",
description="Add specific elements",
responses={201: ElementSlimSerializer},
responses={201: ElementTinySerializer},
request=inline_serializer(
name="AddSelectionBodySerializer",
fields={"ids": serializers.ListField(child=serializers.UUIDField())}
......@@ -1450,7 +1450,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
prefetch_related_objects(elements, "corpus", "image__server", "type")
return Response(
status=status.HTTP_201_CREATED,
data=ElementSlimSerializer(
data=ElementTinySerializer(
elements,
context={"request": request},
many=True
......
......@@ -21,7 +21,7 @@ from arkindex.documents.models import (
Transcription,
TranscriptionEntity,
)
from arkindex.documents.serializers.elements import ElementSlimSerializer
from arkindex.documents.serializers.elements import ElementTinySerializer
from arkindex.documents.serializers.entities import (
BaseEntitySerializer,
CreateEntityRoleErrorResponseSerializer,
......@@ -218,7 +218,7 @@ class EntityElements(ListAPIView):
"""
Get all elements that have a link with the entity
"""
serializer_class = ElementSlimSerializer
serializer_class = ElementTinySerializer
# For OpenAPI type discovery: an entity's ID is in the path
queryset = Entity.objects.none()
......
......@@ -51,7 +51,13 @@ def run_pg_query(query, source_db):
Run a single Postgresql query and split the results into chunks.
When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name.
"""
with connections[source_db].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
db = connections[source_db]
# Make sure a connection is open and available for export databases
if source_db != "default" and db.connection is None:
db.connect()
with db.create_cursor(name=str(uuid.uuid4())) as pg_cursor:
pg_cursor.itersize = BATCH_SIZE
pg_cursor.execute(query)
......
......@@ -2,6 +2,8 @@ SELECT
dataset.id,
dataset.name,
dataset.state,
ARRAY_TO_STRING(dataset.sets, ',', '')
string_agg(datasetset.name, ',')
FROM training_dataset dataset
INNER JOIN training_datasetset datasetset ON datasetset.dataset_id = dataset.id
WHERE dataset.corpus_id = '{corpus_id}'::uuid
GROUP BY dataset.id
SELECT
dataset_element.id,
dataset_element.element_id,
dataset_element.dataset_id,
dataset_element.set
dataset_set.dataset_id,
dataset_set.name
FROM training_datasetelement dataset_element
INNER JOIN training_dataset dataset ON (dataset_element.dataset_id = dataset.id)
INNER JOIN training_datasetset dataset_set ON (dataset_element.set_id = dataset_set.id)
INNER JOIN training_dataset dataset ON (dataset_set.dataset_id = dataset.id)
WHERE dataset.corpus_id = '{corpus_id}'::uuid
......@@ -14,9 +14,14 @@ from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType
from arkindex.users.models import User
# Constants used in architecture project
IMAGE_SERVER_ID = 12345
IMAGE_SERVER_BUCKET = "iiif"
IMAGE_SERVER_REGION = "local"
UPLOADS_IMAGE_SERVER_ID = 12345
UPLOADS_IMAGE_SERVER_URL = "https://uploads.iiif.ark.localhost/iiif/2"
UPLOADS_IMAGE_SERVER_BUCKET = "uploads"
UPLOADS_IMAGE_SERVER_REGION = "local"
INGEST_IMAGE_SERVER_ID = 67890
INGEST_IMAGE_SERVER_URL = "https://ingest.iiif.ark.localhost/iiif/2"
INGEST_IMAGE_SERVER_BUCKET = "ingest"
INGEST_IMAGE_SERVER_REGION = "local"
PONOS_FARM_ID = "001e411a-1111-2222-3333-444455556666"
PONOS_FARM_NAME = "Bootstrap farm"
PONOS_FARM_SEED = "b12868101dab84984481741663d809d2393784894d6e807ceee0bd95051bf971"
......@@ -52,6 +57,46 @@ class Command(BaseCommand):
user.save()
self.warn(f"Updated user {user} to admin")
def create_image_server(self, id, url, bucket, region, display_name):
try:
server = ImageServer.objects.get(Q(id=id) | Q(url=url))
if server.id != id:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=id)
ImageServer.objects.filter(id=server.id).update(id=id)
self.warn(f"Image server {server.id} updated to {id}")
# Update internal reference for updates below
server.id = id
if server.url != url:
server.url = url
server.save()
# Update base settings
if server.s3_bucket != bucket or server.s3_region != region:
server.s3_bucket = bucket
server.s3_region = region
server.save()
self.warn(f"Updated image server {server.id} S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=id,
url=url,
s3_bucket=bucket,
s3_region=region,
display_name=display_name,
)
self.success(f"Image server {server.id} created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
return server
def handle(self, **options):
# Never allow running this script in production
if not settings.DEBUG:
......@@ -108,47 +153,18 @@ class Command(BaseCommand):
self.success(f"Created token {ADMIN_API_TOKEN}")
# an image server for local cantaloupe https://ark.localhost/iiif/2
try:
server = ImageServer.objects.get(url="https://ark.localhost/iiif/2")
if server.id != IMAGE_SERVER_ID:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=IMAGE_SERVER_ID)
ImageServer.objects.filter(id=server.id).update(id=IMAGE_SERVER_ID)
self.warn(f"Image server {server.id} updated to {IMAGE_SERVER_ID}")
# Update internal reference for updates below
server.id = IMAGE_SERVER_ID
# Update base settings
if server.s3_bucket != IMAGE_SERVER_BUCKET or server.s3_region != IMAGE_SERVER_REGION:
server.s3_bucket = IMAGE_SERVER_BUCKET
server.s3_region = IMAGE_SERVER_REGION
server.save()
self.warn("Updated image server S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=IMAGE_SERVER_ID,
url="https://ark.localhost/iiif/2",
s3_bucket=IMAGE_SERVER_BUCKET,
s3_region=IMAGE_SERVER_REGION,
display_name="Development local server",
)
self.success("Image server created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID, UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET, UPLOADS_IMAGE_SERVER_REGION, "Local IIIF server for user uploaded files through frontend")
if uploads_server is None:
return
self.create_image_server(INGEST_IMAGE_SERVER_ID, INGEST_IMAGE_SERVER_URL, INGEST_IMAGE_SERVER_BUCKET, INGEST_IMAGE_SERVER_REGION, "Local IIIF server for ingested files from minio")
# Check there is not already a local server with invalid path
# We'll merge its image into the new one
# This bad server may have been created by automatic IIIF server detection
try:
bad_server = ImageServer.objects.get(url="https://ark.localhost/iiif")
bad_server.merge_into(server)
self.warn(f"Merged images from {bad_server.id} into {server.id}")
bad_server = ImageServer.objects.get(url="https://uploads.iiif.ark.localhost/iiif")
bad_server.merge_into(uploads_server)
self.warn(f"Merged images from {bad_server.id} into {uploads_server.id}")
bad_server.delete()
self.warn("Deleted old server")
......@@ -194,17 +210,21 @@ class Command(BaseCommand):
)
self.success(f"Created revision {revision.hash}")
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
try:
version = WorkerVersion.objects.get(id=IMPORT_WORKER_VERSION_ID)
created = False
except WorkerVersion.DoesNotExist:
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
if created:
self.success(f"Created worker version {version.slug}")
else:
......
#!/usr/bin/env python3
from datetime import datetime, timezone
from unittest.mock import patch
from django.contrib.gis.geos import LinearRing
......@@ -8,7 +7,7 @@ from django.utils import timezone as DjangoTimeZone
from arkindex.documents.models import Corpus, Element, MetaData, MetaType
from arkindex.images.models import Image, ImageServer
from arkindex.ponos.models import Farm, State
from arkindex.ponos.models import Farm
from arkindex.process.models import (
FeatureUsage,
Process,
......@@ -21,6 +20,7 @@ from arkindex.process.models import (
WorkerVersionState,
)
from arkindex.project.tools import fake_now
from arkindex.training.models import DatasetSet
from arkindex.users.models import Group, Right, Role, User
......@@ -104,23 +104,7 @@ class Command(BaseCommand):
farm = Farm.objects.create(name="Wheat farm")
farm.memberships.create(user=user, level=Role.Guest.value)
# Create a fake docker build with a docker image task
build_process = Process.objects.create(
farm=farm,
creator=superuser,
mode=ProcessMode.Repository,
)
build_task = build_process.tasks.create(
run=0,
depth=0,
slug="docker_build",
state=State.Completed,
# Use an expiry very far away so that task is never expired
expiry=datetime(2100, 12, 31, 23, 59, 59, 999999, timezone.utc),
)
docker_image = build_task.artifacts.create(size=42_000, path="/path/to/docker_build")
# Create some workers for the repository with their available version
# Create some workers with available versions
recognizer_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
name="Recognizer",
......@@ -131,7 +115,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
dla_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
......@@ -143,7 +127,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -156,7 +140,7 @@ class Command(BaseCommand):
configuration={},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -169,7 +153,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest",
gpu_usage=FeatureUsage.Required
)
......@@ -185,7 +169,7 @@ class Command(BaseCommand):
state=WorkerVersionState.Available,
gpu_usage=FeatureUsage.Disabled,
model_usage=FeatureUsage.Required,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
# Create a custom worker version that is not linked to a Git repository/revision
......@@ -288,8 +272,15 @@ class Command(BaseCommand):
)
# Create 2 datasets
corpus.datasets.create(name="First Dataset", description="dataset number one", creator=user)
corpus.datasets.create(name="Second Dataset", description="dataset number two", creator=user)
dataset_1 = corpus.datasets.create(name="First Dataset", description="dataset number one", creator=user)
dataset_2 = corpus.datasets.create(name="Second Dataset", description="dataset number two", creator=user)
# Create their sets
DatasetSet.objects.bulk_create(
DatasetSet(name=name, dataset_id=dataset_1.id) for name in ["training", "validation", "test"]
)
DatasetSet.objects.bulk_create(
DatasetSet(name=name, dataset_id=dataset_2.id) for name in ["training", "validation", "test"]
)
# Create 2 volumes
vol1 = Element.objects.create(
......
......@@ -37,7 +37,7 @@ from arkindex.process.models import (
WorkerType,
WorkerVersion,
)
from arkindex.training.models import Dataset, DatasetElement, Model
from arkindex.training.models import Dataset, DatasetElement, DatasetSet, Model
from arkindex.users.models import Role, User
EXPORT_VERSION = 8
......@@ -320,17 +320,30 @@ class Command(BaseCommand):
id=row["id"],
corpus=self.corpus,
name=row["name"],
sets=[r.strip() for r in row["sets"].split(",")],
creator=self.user,
description="Imported dataset",
)]
def convert_dataset_sets(self, row):
return [
DatasetSet(
name=set_name.strip(),
dataset_id=row["id"]
)
for set_name in row["sets"].split(",")
]
def map_dataset_sets(self):
return {
(str(set.dataset_id), set.name): set.id
for set in DatasetSet.objects.filter(dataset__corpus=self.corpus)
}
def convert_dataset_elements(self, row):
return [DatasetElement(
id=row["id"],
element_id=row["element_id"],
dataset_id=row["dataset_id"],
set=row["set_name"],
set_id=self.dataset_sets_map[(row["dataset_id"], row["set_name"])]
)]
def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True):
......@@ -603,6 +616,12 @@ class Command(BaseCommand):
# Create datasets
self.bulk_create_objects(Dataset, self.convert_datasets, SQL_DATASET_QUERY)
# Create dataset sets
self.bulk_create_objects(DatasetSet, self.convert_dataset_sets, SQL_DATASET_QUERY)
# Create dataset sets mapping
self.dataset_sets_map = self.map_dataset_sets()
# Create dataset elements
self.bulk_create_objects(DatasetElement, self.convert_dataset_elements, SQL_ELEMENT_DATASET_QUERY)
......
import math
import uuid
from collections import defaultdict
from functools import cached_property
from textwrap import dedent
from django.conf import settings
......@@ -23,7 +24,6 @@ from arkindex.documents.serializers.light import (
from arkindex.documents.serializers.ml import ClassificationSerializer, WorkerRunSummarySerializer
from arkindex.images.models import Image
from arkindex.images.serializers import ZoneSerializer
from arkindex.ponos.utils import get_process_from_task_auth
from arkindex.process.models import WorkerVersion
from arkindex.project.fields import Array
from arkindex.project.mixins import SelectionMixin
......@@ -429,29 +429,6 @@ class ElementTinySerializer(serializers.ModelSerializer):
)
class ElementSlimSerializer(ElementTinySerializer):
"""
Fully serialises a document
"""
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
@extend_schema_field(serializers.CharField(allow_null=True))
def get_thumbnail_put_url(self, element):
"""
Only set the Thumbnail PUT URL for Ponos tasks that
are running the thumbnails generation on a folder.
"""
if element.type.folder:
process = get_process_from_task_auth(self.context["request"])
if process and process.generate_thumbnails:
return element.thumbnail.s3_put_url
class Meta(ElementTinySerializer.Meta):
model = Element
fields = ElementTinySerializer.Meta.fields + ("thumbnail_put_url",)
read_only_fields = ElementTinySerializer.Meta.read_only_fields + ("thumbnail_put_url",)
@extend_schema_serializer(deprecate_fields=("worker_version_id", ))
class ElementListSerializer(ElementTinySerializer):
created = serializers.DateTimeField(read_only=True)
......@@ -555,7 +532,7 @@ class ElementParentSerializer(serializers.Serializer):
@extend_schema_serializer(deprecate_fields=("worker_version", ))
class ElementSerializer(ElementSlimSerializer):
class ElementSerializer(ElementTinySerializer):
"""
Serialize an element with its metadata and classifications
"""
......@@ -591,9 +568,11 @@ class ElementSerializer(ElementSlimSerializer):
worker_run = WorkerRunSummarySerializer(read_only=True, allow_null=True)
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
class Meta:
model = Element
fields = ElementSlimSerializer.Meta.fields + (
fields = ElementTinySerializer.Meta.fields + (
"created",
"creator",
"rights",
......@@ -603,32 +582,57 @@ class ElementSerializer(ElementSlimSerializer):
"polygon",
"worker_version",
"confidence",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
read_only_fields = ElementSlimSerializer.Meta.read_only_fields + (
read_only_fields = ElementTinySerializer.Meta.read_only_fields + (
"created",
"creator",
"rights",
"metadata_count",
"classifications",
"worker_version",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
@cached_property
def element_rights(self):
if not self.instance:
return
user = self.context["request"].user
level = get_max_level(user, element.corpus)
level = get_max_level(user, self.instance.corpus)
# Admin access is granted to both corpus admins and element creators that are corpus contributors
if level >= Role.Contributor.value and self.instance.creator_id == user.id:
return Role.Admin.value
return level
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
rights = ["read"]
if level >= Role.Contributor.value:
if self.element_rights >= Role.Contributor.value:
rights.append("write")
# Admin access is granted to both corpus admins and element creators
if level >= Role.Admin.value or (level >= Role.Contributor.value and element.creator_id == user.id):
if self.element_rights >= Role.Admin.value:
rights.append("admin")
return rights
@extend_schema_field(serializers.CharField(
allow_null=True,
help_text=dedent("""
URL where a PUT request may be sent to upload a new thumbnail for this element.
Only available on folder elements.
Requires **admin** access to the corpus, or **contributor** access to the corpus and to be the element's creator.
"""),
))
def get_thumbnail_put_url(self, element):
if element.type.folder and self.element_rights >= Role.Admin.value:
return element.thumbnail.s3_put_url
def update(self, instance, validated_data):
image = validated_data.pop("image", None)
polygon = validated_data.pop("polygon", None)
......
......@@ -23,8 +23,8 @@ from arkindex.documents.models import (
TranscriptionEntity,
)
from arkindex.ponos.models import Task
from arkindex.process.models import Process, ProcessDataset, ProcessElement, WorkerActivity, WorkerRun
from arkindex.training.models import DatasetElement
from arkindex.process.models import Process, ProcessDatasetSet, ProcessElement, WorkerActivity, WorkerRun
from arkindex.training.models import DatasetElement, DatasetSet
from arkindex.users.models import User
logger = logging.getLogger(__name__)
......@@ -70,10 +70,11 @@ def corpus_delete(corpus_id: str) -> None:
Selection.objects.filter(element__corpus_id=corpus_id),
corpus.memberships.all(),
corpus.exports.all(),
# ProcessDataset M2M
ProcessDataset.objects.filter(dataset__corpus_id=corpus_id),
ProcessDataset.objects.filter(process__corpus_id=corpus_id),
DatasetElement.objects.filter(dataset__corpus_id=corpus_id),
# ProcessDatasetSet M2M
ProcessDatasetSet.objects.filter(set__dataset__corpus_id=corpus_id),
ProcessDatasetSet.objects.filter(process__corpus_id=corpus_id),
DatasetElement.objects.filter(set__dataset__corpus_id=corpus_id),
DatasetSet.objects.filter(dataset__corpus_id=corpus_id),
corpus.datasets.all(),
# Delete the hidden M2M task parents table
Task.parents.through.objects.filter(from_task__process__corpus_id=corpus_id),
......
......@@ -14,6 +14,7 @@ from arkindex.documents.tasks import corpus_delete
from arkindex.images.models import Image, ImageServer
from arkindex.process.models import ProcessMode, Repository, Worker, WorkerRun, WorkerType, WorkerVersion
from arkindex.project.tests import FixtureTestCase
from arkindex.training.models import Dataset, DatasetElement
BASE_DIR = Path(__file__).absolute().parent
......@@ -132,6 +133,9 @@ class TestLoadExport(FixtureTestCase):
dla_version = WorkerVersion.objects.get(worker__slug="dla")
dla_run = dla_version.worker_runs.get(process__mode=ProcessMode.Workers)
dataset_set = Dataset.objects.first().sets.first()
DatasetElement.objects.create(set=dataset_set, element=element)
element.classifications.create(
ml_class=self.corpus.ml_classes.create(name="Blah"),
confidence=.55555555,
......@@ -266,6 +270,9 @@ class TestLoadExport(FixtureTestCase):
confidence=.55555555,
)
dataset_set = Dataset.objects.first().sets.first()
DatasetElement.objects.create(set=dataset_set, element=element)
person_type = EntityType.objects.get(
name="person",
corpus=self.corpus
......
......@@ -3,9 +3,16 @@ from django.db.models.signals import pre_delete
from arkindex.documents.models import Corpus, Element, EntityType, MetaType, Transcription
from arkindex.documents.tasks import corpus_delete
from arkindex.ponos.models import Farm, State, Task
from arkindex.process.models import CorpusWorkerVersion, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.process.models import (
CorpusWorkerVersion,
Process,
ProcessDatasetSet,
ProcessMode,
Repository,
WorkerVersion,
)
from arkindex.project.tests import FixtureTestCase, force_constraints_immediate
from arkindex.training.models import Dataset
from arkindex.training.models import Dataset, DatasetSet
class TestDeleteCorpus(FixtureTestCase):
......@@ -114,25 +121,32 @@ class TestDeleteCorpus(FixtureTestCase):
cls.corpus2 = Corpus.objects.create(name="Other corpus")
dataset1 = Dataset.objects.get(name="First Dataset")
dataset1.dataset_elements.create(element=element, set="test")
test_set_1 = dataset1.sets.get(name="test")
test_set_1.set_elements.create(element=element)
cls.dataset2 = Dataset.objects.create(name="Dead Sea Scrolls", description="How to trigger a Third Impact", creator=cls.user, corpus=cls.corpus2)
# Process on cls.corpus and with a dataset from cls.corpus
DatasetSet.objects.bulk_create(
DatasetSet(
dataset=cls.dataset2,
name=set_name
) for set_name in ["test", "training", "validation"]
)
# Process on cls.corpus and with a set from cls.corpus
dataset_process1 = cls.corpus.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=dataset_process1, dataset=dataset1, sets=dataset1.sets)
# Process on cls.corpus with a dataset from another corpus
ProcessDatasetSet.objects.create(process=dataset_process1, set=test_set_1)
# Process on cls.corpus with a set from another corpus
dataset_process2 = cls.corpus.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=dataset_process2, dataset=dataset1, sets=dataset1.sets)
ProcessDataset.objects.create(process=dataset_process2, dataset=cls.dataset2, sets=cls.dataset2.sets)
# Process on another corpus with a dataset from another corpus and none from cls.corpus
ProcessDatasetSet.objects.create(process=dataset_process2, set=test_set_1)
ProcessDatasetSet.objects.create(process=dataset_process2, set=cls.dataset2.sets.get(name="training"))
# Process on another corpus with a set from another corpus and none from cls.corpus
cls.dataset_process3 = cls.corpus2.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=cls.dataset_process3, dataset=cls.dataset2, sets=cls.dataset2.sets)
ProcessDatasetSet.objects.create(process=cls.dataset_process3, set=cls.dataset2.sets.get(name="validation"))
cls.rev = cls.repo.revisions.create(
hash="42",
message="oh",
author="me",
)
cls.process = cls.rev.processes.create(
cls.process = Process.objects.create(
creator=cls.user,
corpus=cls.corpus2,
mode=ProcessMode.Files,
......@@ -204,7 +218,6 @@ class TestDeleteCorpus(FixtureTestCase):
self.dataset_process3.refresh_from_db()
self.assertTrue(self.repo.revisions.filter(id=self.rev.id).exists())
self.assertEqual(self.process.revision, self.rev)
self.assertEqual(self.process.files.get(), self.df)
self.assertTrue(Element.objects.get_descending(self.vol.id).filter(id=self.page.id).exists())
self.assertTrue(self.corpus2.datasets.filter(id=self.dataset2.id).exists())
......