Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend
Commits on source: 14
Showing 902 additions and 838 deletions
......@@ -2,6 +2,9 @@
.git
.eggs
*.egg
logs
**/__pycache__/
**/*.pyc
docker/
Makefile
test-report.xml
arkindex/config.yml
......@@ -16,3 +16,4 @@ htmlcov
*.key
arkindex/config.yml
test-report.xml
docker/ssl/*.pem
......@@ -58,7 +58,7 @@ backend-tests:
- test-report.xml
script:
- arkindex/manage.py test
- arkindex test
backend-lint:
image: python:3.10
......@@ -91,7 +91,7 @@ backend-migrations:
alias: postgres
script:
- arkindex/manage.py makemigrations --check --noinput --dry-run -v 3
- arkindex makemigrations --check --noinput --dry-run -v 3
backend-openapi:
extends: .backend-setup
......
......@@ -19,10 +19,13 @@ RUN chown -R ark:teklia /backend_static
# Copy Version file
COPY VERSION /etc/arkindex.version
HEALTHCHECK --start-period=1m --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
ENV PORT 8000
HEALTHCHECK --start-period=10s --interval=30s --timeout=5s \
CMD wget --spider --quiet http://localhost:$PORT/api/v1/corpus/ || exit 1
# Allow usage of django-admin by exposing our settings
ENV DJANGO_SETTINGS_MODULE "arkindex.project.settings"
# Run with Gunicorn
ENV PORT 8000
EXPOSE $PORT
CMD manage.py gunicorn --host=0.0.0.0 --port $PORT
CMD arkindex gunicorn --host=0.0.0.0 --port $PORT
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend
.PHONY: all release
.PHONY: all release services
all: clean build
......@@ -16,33 +16,33 @@ build:
CI_PROJECT_DIR=$(ROOT_DIR) CI_REGISTRY_IMAGE=$(IMAGE_TAG) $(ROOT_DIR)/ci/build.sh
worker:
arkindex/manage.py rqworker -v 2 default high tasks
arkindex rqworker -v 2 default high tasks
test-fixtures:
$(eval export PGPASSWORD=devdata)
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
# A "try...finally" block in a Makefile: ensure we bring back the dev database even when test-fixtures fails
-$(MAKE) test-fixtures-run
$(MAKE) test-fixtures-restore
test-fixtures-run:
arkindex/manage.py migrate
arkindex/manage.py build_fixtures
arkindex/manage.py dumpdata --indent 4 process documents images users auth ponos training > arkindex/documents/fixtures/data.json
arkindex migrate
arkindex build_fixtures
arkindex dumpdata --indent 4 process documents images users auth ponos training > arkindex/documents/fixtures/data.json
test-fixtures-restore:
# This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
require-version:
@if [ ! "$(version)" ]; then echo "Missing version to publish"; exit 1; fi
@git rev-parse $(version) >/dev/null 2>&1 && (echo "Version $(version) already exists on local git repo !" && exit 1) || true
schema:
./arkindex/manage.py spectacular --fail-on-warn --validate --file schema.yml
arkindex spectacular --fail-on-warn --validate --file schema.yml
release:
$(eval version:=$(shell cat VERSION))
......@@ -50,3 +50,21 @@ release:
git commit VERSION -m "Version $(version)"
git tag $(version)
git push origin master $(version)
clean-docker:
$(eval containers:=$(shell docker ps -a -q))
@if [ -n "$(containers)" ]; then \
echo "Cleaning up past containers\n" \
docker rm -f $(containers) ; \
fi
stack: docker/ssl/ark-cert.pem
docker compose -p arkindex up --build
services: docker/ssl/ark-cert.pem
docker compose -p arkindex -f docker/docker-compose.services.yml up
docker/ssl/ark-cert.pem:
$(eval export CAROOT=$(ROOT_DIR)/docker/ssl)
mkcert -install
mkcert -cert-file=$(ROOT_DIR)/docker/ssl/ark-cert.pem -key-file=$(ROOT_DIR)/docker/ssl/ark-key.pem ark.localhost *.ark.localhost *.iiif.ark.localhost
......@@ -6,43 +6,83 @@ Backend for Historical Manuscripts Indexing
## Requirements
* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture)
* Git
* Make
* Python 3.6+
* Python 3.10+
* pip
* [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/)
* [Docker 24+](https://docs.docker.com/engine/install/#supported-platforms)
* [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation)
* [GeoDjango system dependencies](https://docs.djangoproject.com/en/3.1/ref/contrib/gis/install/geolibs/): `sudo apt install binutils libproj-dev gdal-bin`
## Dev Setup
## Setup for developers
```
You'll also need the [Arkindex frontend](https://gitlab.teklia.com/arkindex/frontend) to be able to develop on the whole platform.
```console
git clone git@gitlab.teklia.com:arkindex/backend.git
git clone git@gitlab.teklia.com:arkindex/frontend.git
cd backend
mkvirtualenv ark -a .
mkvirtualenv ark -a . -p /usr/bin/python3.10
pip install -e .[test]
```
When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services:
The Arkindex backend relies on some open-source services to store data and communicate with asynchronous workers.
To run all the required services, please run in a dedicated shell:
```console
make services
```
arkindex/manage.py migrate
arkindex/manage.py createsuperuser
On a first run, you'll need to:
1. Configure the instance by enabling the sample configuration.
2. Populate the database structure.
3. Initialize some fields in the database.
4. Create an administration account.
All of these steps are done through:
```console
cp config.yml.sample arkindex/config.yml
arkindex migrate
arkindex bootstrap
arkindex createsuperuser
```
### Local configuration
Finally, you can run the backend:
```console
arkindex runserver
```
At this stage, you can use `http://localhost:8000/admin` to access the administration interface.
### Asynchronous tasks
To run asynchronous tasks, run in another shell:
```console
make worker
```
### Dockerized stack
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://wiki.vpn/en/arkindex/deploy/configuration).
It is also possible to run the whole Arkindex stack through Docker containers. This is useful to quickly test the platform.
This command will build all the required Docker images (backend & frontend) and run them as Docker containers:
Another mean to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
```console
make stack
```
### ImageMagick setup
You'll be able to access the platform at the URL `https://ark.localhost`.
PDF and image imports in Arkindex will require ImageMagick. Due to its ability to take any computer down if you give it the right parameters (for example, converting a 1000-page PDF file into JPEG files at 30 000 DPI), it has a security policy file. By default, on Ubuntu, PDF conversion is forbidden.
### Local configuration
You will need to edit the ImageMagick policy file to get PDF and Image imports to work in Arkindex. The file is located at `/etc/ImageMagick-6/policy.xml`.
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://redmine.teklia.com/projects/arkindex/wiki/Backend_configuration).
The line that sets the PDF policy is `<policy domain="coder" rights="none" pattern="PDF" />`. Replace `none` with `read|write` for it to work. See [this StackOverflow question](https://stackoverflow.com/questions/52998331) for more info.
Another way to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
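
As a minimal sketch, a `local_settings.py` could look like the following; the settings shown are plain Django settings chosen for illustration, not a documented list of Arkindex options:

```python
# arkindex/project/local_settings.py (not tracked by Git; only loaded if it exists)
# Illustrative overrides only: any Django setting can be customized here.
DEBUG = True

# Example of a dev-only tweak, e.g. to let django-debug-toolbar show up locally
INTERNAL_IPS = ["127.0.0.1"]
```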
### Local image server
......@@ -54,19 +94,14 @@ local_imageserver_id: 999
Here is how to quickly create the ImageServer using the shell:
```
backend/arkindex$ ./manage.py shell
```python
$ arkindex shell
>>> from arkindex.images.models import ImageServer
>>> ImageServer.objects.create(id=1, display_name='local', url='https://ark.localhost/iiif')
```
Note that this local server will only work inside Docker.
### User groups
We use a custom group model in `arkindex.users.models` (not the `django.contrib.auth` one).
In this early version groups do not define any right yet.
## Usage
### Makefile
......@@ -76,31 +111,28 @@ At the root of the repository is a Makefile that provides commands for common op
* `make` or `make all`: Clean and build;
* `make base`: Create and push the `arkindex-base` Docker image that is used to build the `arkindex-app` image;
* `make clean`: Cleanup the Python package build and cache files;
* `make clean-docker`: Deletes all running containers to avoid naming and network ports conflicts;
* `make build`: Build the arkindex Python package and recreate the `arkindex-app:latest` without pushing to the GitLab container registry;
* `make test-fixtures`: Create the unit tests fixtures on a temporary PostgreSQL database and save them to the `data.json` file used by most Django unit tests.
### Django commands
Aside from the usual Django commands, some custom commands are available via `manage.py`:
Aside from the usual Django commands, some custom commands are available via `arkindex`:
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`);
* `from_csv`: Import manifests and index files from a CSV list;
* `import_annotations`: Import index files from a folder into a specific volume;
* `import_acts`: Import XML surface files and CSV act files;
* `delete_corpus`: Delete a big corpus using an RQ task;
* `reindex`: Reindex elements into Solr;
* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics.
* `move_lines_to_parents`: Moves element children to their geographical parents;
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`).
* `delete_corpus`: Delete a big corpus using an RQ task.
* `reindex`: Reindex elements into Solr.
* `move_lines_to_parents`: Moves element children to their geographical parents.
See `manage.py <command> --help` to view more details about a specific command.
See `arkindex <command> --help` to view more details about a specific command.
## Code validation
Once your code appears to be working on a local server, a few checks have to be performed:
* **Migrations:** Ensure that all migrations have been created by typing `./manage.py makemigrations`.
* **Unit tests:** Run `./manage.py test` to perform unit tests.
- Use `./manage.py test module_name` to perform tests on a single module, if you wish to spend less time waiting for all tests to complete.
* **Migrations:** Ensure that all migrations have been created by typing `arkindex makemigrations`.
* **Unit tests:** Run `arkindex test` to perform unit tests.
- Use `arkindex test module_name` to perform tests on a single module, if you wish to spend less time waiting for all tests to complete.
### Linting
......@@ -108,9 +140,9 @@ We use [pre-commit](https://pre-commit.com/) to check the Python source code syn
To be efficient, you should run pre-commit before committing (hence the name...).
To do that, run once :
To do that, run once:
```
```console
pip install pre-commit
pre-commit install
```
......@@ -123,13 +155,13 @@ If you want to run the full workflow on all the files: `pre-commit run -a`.
Run `pip install ipython django-debug-toolbar django_extensions` to install all the available optional dev tools for the backend.
IPython will give you a nicer shell with syntax highlighting, auto reloading and much more via `./manage.py shell`.
IPython will give you a nicer shell with syntax highlighting, auto reloading and much more via `arkindex shell`.
[Django Debug Toolbar](https://django-debug-toolbar.readthedocs.io/en/latest/) provides you with a neat debug sidebar that will help diagnosing slow API endpoints or weird template bugs. Since the Arkindex frontend is completely decoupled from the backend, you will need to browse to an API endpoint to see the debug toolbar.
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `manage.py` commands ; the most important one is `./manage.py shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports most of the backend's enums and some special QuerySet features:
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus`, which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports some of the backend's enums and some special QuerySet features:
``` python
```python
SHELL_PLUS_POST_IMPORTS = [
('django.db.models', ('Value', )),
('django.db.models.functions', '*'),
......@@ -138,7 +170,7 @@ SHELL_PLUS_POST_IMPORTS = [
'Right',
)),
('arkindex.process.models', (
'DataImportMode',
'ProcessMode',
)),
('arkindex.project.aws', (
'S3FileStatus',
......@@ -148,23 +180,33 @@ SHELL_PLUS_POST_IMPORTS = [
## Asynchronous tasks
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist:
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`, or in other `tasks` modules within each Django app. The following tasks exist:
* Delete a corpus: `corpus_delete`
* Delete a list of elements: `element_trash`
* Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete`
* Move an element to another parent: `move_element`
* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity`
* Create `WorkerActivity` instances for all elements of a process: `initialize_activity`
* Delete a process and its worker activities: `process_delete`
* Export a corpus to an SQLite database: `export_corpus`
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build.
Process tasks are run in RQ by default (Community Edition). Two RQ workers must be running at the same time to actually run a process with worker activities, so the initialisation task can wait for the worker activity task to finish:
```sh
$ manage.py rqworker -v 3 default high & manage.py rqworker -v 3 tasks
```
To run them, use `make worker` to start an RQ worker. You will need to have Redis running; `make services` will provide it. `make stack` also provides an RQ worker running in Docker from a binary build.
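
As a hedged sketch of enqueuing one of these tasks from Python code through a trigger (the function name `corpus_delete` and its signature are assumptions mirroring the task name above; check `arkindex.project.triggers` for the real API):

```python
# Hypothetical usage, e.g. from `arkindex shell`; the trigger name and signature
# are assumptions, the actual trigger methods live in arkindex.project.triggers.
from arkindex.documents.models import Corpus
from arkindex.project import triggers

corpus = Corpus.objects.first()
# Enqueues an RQ task instead of deleting synchronously during a request
triggers.corpus_delete(corpus)
```

A worker started with `make worker` would then pick the job up from one of its queues (`default`, `high`, `tasks`).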
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
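
As a quick, hedged sanity check (the port `3000` is only an example value for `PROMETHEUS_METRICS_PORT`, and the `requests` package is assumed to be installed):

```python
# Fetch the Prometheus exposition output from the dedicated metrics port.
import requests

response = requests.get("http://localhost:3000/metrics", timeout=5)
response.raise_for_status()
print("\n".join(response.text.splitlines()[:5]))  # show the first few metric lines
```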
## Migration from `architecture` setup
If you previously used the `architecture` repository to run Arkindex, you'll need to migrate MinIO data from a static path on your computer to a new Docker volume.
```console
docker volume create arkindex_miniodata
mv /usr/share/arkindex/s3/data/iiif /var/lib/docker/volumes/arkindex_miniodata/_data/uploads
mv /usr/share/arkindex/s3/data/{export,iiif-cache,ponos-logs,ponos-artifacts,staging,thumbnails,training} /var/lib/docker/volumes/arkindex_miniodata/_data/
```
You will also need to set up [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation) as we do not use the Teklia development Certificate Authority anymore. `mkcert` will take care of SSL certificates automatically, updating your browsers and system certificate store!
Finally, you can remove the `architecture` project from your work folder, as it's now archived and could be confusing.
1.5.4
1.6.0-beta2
......@@ -67,7 +67,7 @@ from arkindex.documents.serializers.elements import (
ElementNeighborsSerializer,
ElementParentSerializer,
ElementSerializer,
ElementSlimSerializer,
ElementTinySerializer,
ElementTypeSerializer,
MetaDataBulkSerializer,
MetaDataCreateSerializer,
......@@ -1410,7 +1410,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
@extend_schema(
operation_id="AddSelection",
description="Add specific elements",
responses={201: ElementSlimSerializer},
responses={201: ElementTinySerializer},
request=inline_serializer(
name="AddSelectionBodySerializer",
fields={"ids": serializers.ListField(child=serializers.UUIDField())}
......@@ -1450,7 +1450,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
prefetch_related_objects(elements, "corpus", "image__server", "type")
return Response(
status=status.HTTP_201_CREATED,
data=ElementSlimSerializer(
data=ElementTinySerializer(
elements,
context={"request": request},
many=True
......
......@@ -21,7 +21,7 @@ from arkindex.documents.models import (
Transcription,
TranscriptionEntity,
)
from arkindex.documents.serializers.elements import ElementSlimSerializer
from arkindex.documents.serializers.elements import ElementTinySerializer
from arkindex.documents.serializers.entities import (
BaseEntitySerializer,
CreateEntityRoleErrorResponseSerializer,
......@@ -218,7 +218,7 @@ class EntityElements(ListAPIView):
"""
Get all elements that have a link with the entity
"""
serializer_class = ElementSlimSerializer
serializer_class = ElementTinySerializer
# For OpenAPI type discovery: an entity's ID is in the path
queryset = Entity.objects.none()
......
......@@ -2,7 +2,9 @@ from datetime import timedelta
from textwrap import dedent
from django.conf import settings
from django.shortcuts import get_object_or_404
from django.utils import timezone
from django.utils.functional import cached_property
from drf_spectacular.utils import extend_schema, extend_schema_view
from rest_framework import permissions, serializers, status
from rest_framework.exceptions import PermissionDenied, ValidationError
......@@ -11,9 +13,7 @@ from rest_framework.response import Response
from arkindex.documents.models import Corpus, CorpusExport, CorpusExportState
from arkindex.documents.serializers.export import CorpusExportSerializer
from arkindex.project.mixins import CorpusACLMixin
from arkindex.project.permissions import IsVerified
from arkindex.users.models import Role
@extend_schema(tags=["exports"])
......@@ -27,47 +27,42 @@ from arkindex.users.models import Role
),
post=extend_schema(
operation_id="StartExport",
request=None,
description=dedent(
f"""
Start a corpus export job.
A user must wait for {settings.EXPORT_TTL_SECONDS} seconds after the last successful import
before being able to generate a new export of the same corpus.
before being able to generate a new export of the same corpus from the same source.
Contributor access is required.
"""
),
)
)
class CorpusExportAPIView(CorpusACLMixin, ListCreateAPIView):
class CorpusExportAPIView(ListCreateAPIView):
permission_classes = (IsVerified, )
serializer_class = CorpusExportSerializer
queryset = CorpusExport.objects.none()
@cached_property
def corpus(self):
qs = Corpus.objects.readable(self.request.user)
corpus = get_object_or_404(qs, pk=self.kwargs["pk"])
if self.request.method not in permissions.SAFE_METHODS and not corpus.is_writable(self.request.user):
raise PermissionDenied(detail="You do not have write access to this corpus.")
return corpus
def get_queryset(self):
return CorpusExport \
.objects \
.filter(corpus=self.get_corpus(self.kwargs["pk"])) \
.filter(corpus=self.corpus) \
.select_related("user") \
.order_by("-created")
def post(self, *args, **kwargs):
corpus = self.get_corpus(self.kwargs["pk"], role=Role.Contributor)
if corpus.exports.filter(state__in=(CorpusExportState.Created, CorpusExportState.Running)).exists():
raise ValidationError("An export is already running for this corpus.")
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f"An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.")
export = corpus.exports.create(user=self.request.user)
export.start()
return Response(CorpusExportSerializer(export).data, status=status.HTTP_201_CREATED)
def get_serializer_context(self):
context = super().get_serializer_context()
context["corpus"] = self.corpus
return context
@extend_schema(
......
......@@ -46,12 +46,12 @@ EXPORT_QUERIES = [
]
def run_pg_query(query):
def run_pg_query(query, source_db):
"""
Run a single Postgresql query and split the results into chunks.
When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name.
"""
with connections["default"].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
with connections[source_db].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
pg_cursor.itersize = BATCH_SIZE
pg_cursor.execute(query)
......@@ -122,7 +122,11 @@ def export_corpus(corpus_export: CorpusExport) -> None:
corpus_export.state = CorpusExportState.Running
corpus_export.save()
logger.info(f"Exporting corpus {corpus_export.corpus_id} into {db_path}")
export_source = f"{corpus_export.corpus_id}"
if corpus_export.source != "default":
export_source += f" from source {corpus_export.source}"
logger.info(f"Exporting corpus {export_source} into {db_path}")
db = sqlite3.connect(db_path)
cursor = db.cursor()
......@@ -135,7 +139,7 @@ def export_corpus(corpus_export: CorpusExport) -> None:
if rq_job:
rq_job.set_progress(i / (len(EXPORT_QUERIES) + 1))
for chunk in run_pg_query(query.format(corpus_id=corpus_export.corpus_id)):
for chunk in run_pg_query(query.format(corpus_id=corpus_export.corpus_id), corpus_export.source):
save_sqlite(chunk, name, cursor)
db.commit()
......
This diff is collapsed.
......@@ -14,9 +14,14 @@ from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType
from arkindex.users.models import User
# Constants used in architecture project
IMAGE_SERVER_ID = 12345
IMAGE_SERVER_BUCKET = "iiif"
IMAGE_SERVER_REGION = "local"
UPLOADS_IMAGE_SERVER_ID = 12345
UPLOADS_IMAGE_SERVER_URL = "https://uploads.iiif.ark.localhost/iiif/2"
UPLOADS_IMAGE_SERVER_BUCKET = "uploads"
UPLOADS_IMAGE_SERVER_REGION = "local"
INGEST_IMAGE_SERVER_ID = 67890
INGEST_IMAGE_SERVER_URL = "https://ingest.iiif.ark.localhost/iiif/2"
INGEST_IMAGE_SERVER_BUCKET = "ingest"
INGEST_IMAGE_SERVER_REGION = "local"
PONOS_FARM_ID = "001e411a-1111-2222-3333-444455556666"
PONOS_FARM_NAME = "Bootstrap farm"
PONOS_FARM_SEED = "b12868101dab84984481741663d809d2393784894d6e807ceee0bd95051bf971"
......@@ -52,6 +57,46 @@ class Command(BaseCommand):
user.save()
self.warn(f"Updated user {user} to admin")
def create_image_server(self, id, url, bucket, region, display_name):
try:
server = ImageServer.objects.get(Q(id=id) | Q(url=url))
if server.id != id:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=id)
ImageServer.objects.filter(id=server.id).update(id=id)
self.warn(f"Image server {server.id} updated to {id}")
# Update internal reference for updates below
server.id = id
if server.url != url:
server.url = url
server.save()
# Update base settings
if server.s3_bucket != bucket or server.s3_region != region:
server.s3_bucket = bucket
server.s3_region = region
server.save()
self.warn(f"Updated image server {server.id} S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=id,
url=url,
s3_bucket=bucket,
s3_region=region,
display_name=display_name,
)
self.success(f"Image server {server.id} created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
return server
def handle(self, **options):
# Never allow running this script in production
if not settings.DEBUG:
......@@ -108,47 +153,18 @@ class Command(BaseCommand):
self.success(f"Created token {ADMIN_API_TOKEN}")
# an image server for local cantaloupe https://ark.localhost/iiif/2
try:
server = ImageServer.objects.get(url="https://ark.localhost/iiif/2")
if server.id != IMAGE_SERVER_ID:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=IMAGE_SERVER_ID)
ImageServer.objects.filter(id=server.id).update(id=IMAGE_SERVER_ID)
self.warn(f"Image server {server.id} updated to {IMAGE_SERVER_ID}")
# Update internal reference for updates below
server.id = IMAGE_SERVER_ID
# Update base settings
if server.s3_bucket != IMAGE_SERVER_BUCKET or server.s3_region != IMAGE_SERVER_REGION:
server.s3_bucket = IMAGE_SERVER_BUCKET
server.s3_region = IMAGE_SERVER_REGION
server.save()
self.warn("Updated image server S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=IMAGE_SERVER_ID,
url="https://ark.localhost/iiif/2",
s3_bucket=IMAGE_SERVER_BUCKET,
s3_region=IMAGE_SERVER_REGION,
display_name="Development local server",
)
self.success("Image server created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID, UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET, UPLOADS_IMAGE_SERVER_REGION, "Local IIIF server for user uploaded files through frontend")
if uploads_server is None:
return
self.create_image_server(INGEST_IMAGE_SERVER_ID, INGEST_IMAGE_SERVER_URL, INGEST_IMAGE_SERVER_BUCKET, INGEST_IMAGE_SERVER_REGION, "Local IIIF server for ingested files from minio")
# Check there is not already a local server with invalid path
# We'll merge its image into the new one
# This bad server may have been created by automatic IIIF server detection
try:
bad_server = ImageServer.objects.get(url="https://ark.localhost/iiif")
bad_server.merge_into(server)
self.warn(f"Merged images from {bad_server.id} into {server.id}")
bad_server = ImageServer.objects.get(url="https://uploads.iiif.ark.localhost/iiif")
bad_server.merge_into(uploads_server)
self.warn(f"Merged images from {bad_server.id} into {uploads_server.id}")
bad_server.delete()
self.warn("Deleted old server")
......@@ -194,17 +210,21 @@ class Command(BaseCommand):
)
self.success(f"Created revision {revision.hash}")
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
try:
version = WorkerVersion.objects.get(id=IMPORT_WORKER_VERSION_ID)
created = False
except WorkerVersion.DoesNotExist:
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
if created:
self.success(f"Created worker version {version.slug}")
else:
......
#!/usr/bin/env python3
from datetime import datetime, timezone
from unittest.mock import patch
from django.contrib.gis.geos import LinearRing
......@@ -8,7 +7,7 @@ from django.utils import timezone as DjangoTimeZone
from arkindex.documents.models import Corpus, Element, MetaData, MetaType
from arkindex.images.models import Image, ImageServer
from arkindex.ponos.models import Farm, State
from arkindex.ponos.models import Farm
from arkindex.process.models import (
FeatureUsage,
Process,
......@@ -104,23 +103,7 @@ class Command(BaseCommand):
farm = Farm.objects.create(name="Wheat farm")
farm.memberships.create(user=user, level=Role.Guest.value)
# Create a fake docker build with a docker image task
build_process = Process.objects.create(
farm=farm,
creator=superuser,
mode=ProcessMode.Repository,
)
build_task = build_process.tasks.create(
run=0,
depth=0,
slug="docker_build",
state=State.Completed,
# Use an expiry very far away so that task is never expired
expiry=datetime(2100, 12, 31, 23, 59, 59, 999999, timezone.utc),
)
docker_image = build_task.artifacts.create(size=42_000, path="/path/to/docker_build")
# Create some workers for the repository with their available version
# Create some workers with available versions
recognizer_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
name="Recognizer",
......@@ -131,7 +114,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
dla_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
......@@ -143,7 +126,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -156,7 +139,7 @@ class Command(BaseCommand):
configuration={},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -169,7 +152,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest",
gpu_usage=FeatureUsage.Required
)
......@@ -185,7 +168,7 @@ class Command(BaseCommand):
state=WorkerVersionState.Available,
gpu_usage=FeatureUsage.Disabled,
model_usage=FeatureUsage.Required,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
# Create a custom worker version that is not linked to a Git repository/revision
......
......@@ -57,7 +57,7 @@ class Migration(migrations.Migration):
),
),
],
# This can be removed by manage.py squashmigrations
# This can be removed by `arkindex squashmigrations`
elidable=True,
),
]
# Generated by Django 4.1.7 on 2024-02-28 15:56
from django.db import migrations, models
from arkindex.project import settings
class Migration(migrations.Migration):
dependencies = [
("documents", "0008_alter_elementtype_color_alter_entitytype_color"),
]
operations = [
migrations.AddField(
model_name="corpusexport",
name="source",
field=models.CharField(choices=[(source, source) for source in settings.EXPORT_SOURCES], default="default", max_length=50),
),
]
......@@ -73,6 +73,18 @@ class Corpus(IndexableModel):
for values in DEFAULT_CORPUS_TYPES
)
def is_writable(self, user) -> bool:
"""
Whether a user has write access to this corpus
"""
if user.is_anonymous or getattr(user, "is_agent", False):
return False
if user.is_admin:
return True
from arkindex.users.utils import get_max_level
level = get_max_level(user, self)
return level is not None and level >= Role.Contributor.value
class ElementType(models.Model):
id = models.UUIDField(default=uuid.uuid4, primary_key=True, editable=False)
......@@ -1185,6 +1197,7 @@ class CorpusExport(S3FileMixin, IndexableModel):
corpus = models.ForeignKey(Corpus, related_name="exports", on_delete=models.CASCADE)
user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name="exports", on_delete=models.CASCADE)
state = EnumField(CorpusExportState, max_length=10, default=CorpusExportState.Created)
source = models.CharField(max_length=50, default="default", choices=[(source, source) for source in settings.EXPORT_SOURCES])
s3_bucket = settings.AWS_EXPORT_BUCKET
......
import math
import uuid
from collections import defaultdict
from functools import cached_property
from textwrap import dedent
from django.conf import settings
......@@ -23,7 +24,6 @@ from arkindex.documents.serializers.light import (
from arkindex.documents.serializers.ml import ClassificationSerializer, WorkerRunSummarySerializer
from arkindex.images.models import Image
from arkindex.images.serializers import ZoneSerializer
from arkindex.ponos.utils import get_process_from_task_auth
from arkindex.process.models import WorkerVersion
from arkindex.project.fields import Array
from arkindex.project.mixins import SelectionMixin
......@@ -429,29 +429,6 @@ class ElementTinySerializer(serializers.ModelSerializer):
)
class ElementSlimSerializer(ElementTinySerializer):
"""
Fully serialises a document
"""
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
@extend_schema_field(serializers.CharField(allow_null=True))
def get_thumbnail_put_url(self, element):
"""
Only set the Thumbnail PUT URL for Ponos tasks that
are running the thumbnails generation on a folder.
"""
if element.type.folder:
process = get_process_from_task_auth(self.context["request"])
if process and process.generate_thumbnails:
return element.thumbnail.s3_put_url
class Meta(ElementTinySerializer.Meta):
model = Element
fields = ElementTinySerializer.Meta.fields + ("thumbnail_put_url",)
read_only_fields = ElementTinySerializer.Meta.read_only_fields + ("thumbnail_put_url",)
@extend_schema_serializer(deprecate_fields=("worker_version_id", ))
class ElementListSerializer(ElementTinySerializer):
created = serializers.DateTimeField(read_only=True)
......@@ -555,7 +532,7 @@ class ElementParentSerializer(serializers.Serializer):
@extend_schema_serializer(deprecate_fields=("worker_version", ))
class ElementSerializer(ElementSlimSerializer):
class ElementSerializer(ElementTinySerializer):
"""
Serialize an element with its metadata and classifications
"""
......@@ -591,9 +568,11 @@ class ElementSerializer(ElementSlimSerializer):
worker_run = WorkerRunSummarySerializer(read_only=True, allow_null=True)
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
class Meta:
model = Element
fields = ElementSlimSerializer.Meta.fields + (
fields = ElementTinySerializer.Meta.fields + (
"created",
"creator",
"rights",
......@@ -603,32 +582,57 @@ class ElementSerializer(ElementSlimSerializer):
"polygon",
"worker_version",
"confidence",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
read_only_fields = ElementSlimSerializer.Meta.read_only_fields + (
read_only_fields = ElementTinySerializer.Meta.read_only_fields + (
"created",
"creator",
"rights",
"metadata_count",
"classifications",
"worker_version",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
@cached_property
def element_rights(self):
if not self.instance:
return
user = self.context["request"].user
level = get_max_level(user, element.corpus)
level = get_max_level(user, self.instance.corpus)
# Admin access is granted to both corpus admins and element creators that are corpus contributors
if level >= Role.Contributor.value and self.instance.creator_id == user.id:
return Role.Admin.value
return level
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
rights = ["read"]
if level >= Role.Contributor.value:
if self.element_rights >= Role.Contributor.value:
rights.append("write")
# Admin access is granted to both corpus admins and element creators
if level >= Role.Admin.value or (level >= Role.Contributor.value and element.creator_id == user.id):
if self.element_rights >= Role.Admin.value:
rights.append("admin")
return rights
@extend_schema_field(serializers.CharField(
allow_null=True,
help_text=dedent("""
URL where a PUT request may be sent to upload a new thumbnail for this element.
Only available on folder elements.
Requires **admin** access to the corpus, or **contributor** access to the corpus and to be the element's creator.
"""),
))
def get_thumbnail_put_url(self, element):
if element.type.folder and self.element_rights >= Role.Admin.value:
return element.thumbnail.s3_put_url
def update(self, instance, validated_data):
image = validated_data.pop("image", None)
polygon = validated_data.pop("polygon", None)
......
from datetime import timedelta
from django.conf import settings
from django.utils import timezone
from rest_framework import serializers
from rest_framework.exceptions import ValidationError
from arkindex.documents.models import CorpusExport, CorpusExportState
from arkindex.project.serializer_fields import EnumField
......@@ -6,9 +11,38 @@ from arkindex.users.serializers import SimpleUserSerializer
class CorpusExportSerializer(serializers.ModelSerializer):
user = SimpleUserSerializer()
state = EnumField(CorpusExportState)
user = SimpleUserSerializer(read_only=True)
state = EnumField(CorpusExportState, read_only=True)
class Meta:
model = CorpusExport
fields = ("id", "created", "updated", "corpus_id", "user", "state")
fields = ("id", "created", "updated", "corpus_id", "user", "state", "source",)
def validate(self, data):
corpus = self.context["corpus"]
source = data.get("source", "default")
# Check that there is no export already running for this corpus
if corpus.exports.filter(state__in=(CorpusExportState.Created, CorpusExportState.Running)).exists():
raise ValidationError("An export is already running for this corpus.")
# Check that there is no available completed export from the same source created less than {EXPORT_TTL_SECONDS}
# ago for this corpus
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
source=source,
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f"An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.")
data["corpus"] = corpus
data["source"] = source
return data
def create(self, validated_data):
export = CorpusExport.objects.create(
user=self.context["request"].user,
corpus=validated_data["corpus"],
source=validated_data["source"]
)
export.start()
return export
......@@ -3,7 +3,7 @@ from django.db.models.signals import pre_delete
from arkindex.documents.models import Corpus, Element, EntityType, MetaType, Transcription
from arkindex.documents.tasks import corpus_delete
from arkindex.ponos.models import Farm, State, Task
from arkindex.process.models import CorpusWorkerVersion, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.process.models import CorpusWorkerVersion, Process, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.project.tests import FixtureTestCase, force_constraints_immediate
from arkindex.training.models import Dataset
......@@ -132,7 +132,7 @@ class TestDeleteCorpus(FixtureTestCase):
message="oh",
author="me",
)
cls.process = cls.rev.processes.create(
cls.process = Process.objects.create(
creator=cls.user,
corpus=cls.corpus2,
mode=ProcessMode.Files,
......@@ -204,7 +204,6 @@ class TestDeleteCorpus(FixtureTestCase):
self.dataset_process3.refresh_from_db()
self.assertTrue(self.repo.revisions.filter(id=self.rev.id).exists())
self.assertEqual(self.process.revision, self.rev)
self.assertEqual(self.process.files.get(), self.df)
self.assertTrue(Element.objects.get_descending(self.vol.id).filter(id=self.page.id).exists())
self.assertTrue(self.corpus2.datasets.filter(id=self.dataset2.id).exists())
......