
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (27)
Showing changes with 1632 additions and 835 deletions
......@@ -2,6 +2,9 @@
.git
.eggs
*.egg
logs
**/__pycache__/
**/*.pyc
docker/
Makefile
test-report.xml
arkindex/config.yml
......@@ -16,3 +16,4 @@ htmlcov
*.key
arkindex/config.yml
test-report.xml
docker/ssl/*.pem
include VERSION
include LICENSE
include requirements.txt
include base/requirements.txt
include tests-requirements.txt
......
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend
.PHONY: all release
.PHONY: all release services
all: clean build
......@@ -20,8 +20,8 @@ worker:
test-fixtures:
$(eval export PGPASSWORD=devdata)
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
# A "try...finally" block in a Makefile: ensure we bring back the dev database even when test-fixtures fails
-$(MAKE) test-fixtures-run
$(MAKE) test-fixtures-restore
......@@ -33,9 +33,9 @@ test-fixtures-run:
test-fixtures-restore:
# This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
require-version:
@if [ ! "$(version)" ]; then echo "Missing version to publish"; exit 1; fi
......@@ -50,3 +50,21 @@ release:
git commit VERSION -m "Version $(version)"
git tag $(version)
git push origin master $(version)
clean-docker:
$(eval containers:=$(shell docker ps -a -q))
@if [ -n "$(containers)" ]; then \
echo "Cleaning up past containers\n" \
docker rm -f $(containers) ; \
fi
stack: docker/ssl/ark-cert.pem
docker compose -p arkindex up --build
services: docker/ssl/ark-cert.pem
docker compose -p arkindex -f docker/docker-compose.services.yml up
docker/ssl/ark-cert.pem:
$(eval export CAROOT=$(ROOT_DIR)/docker/ssl)
mkcert -install
mkcert -cert-file=$(ROOT_DIR)/docker/ssl/ark-cert.pem -key-file=$(ROOT_DIR)/docker/ssl/ark-key.pem ark.localhost *.ark.localhost *.iiif.ark.localhost
Backend for Historical Manuscripts Indexing
===========================================
# Arkindex Backend
[![pipeline status](https://gitlab.teklia.com/arkindex/backend/badges/master/pipeline.svg)](https://gitlab.teklia.com/arkindex/backend/commits/master)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
This project is the open-source backend of Arkindex, used to manage and process image documents with Machine Learning tools.
It is licensed under the [AGPL-v3 license](./LICENSE).
## Requirements
* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture)
* Git
* Make
* Python 3.6+
* Python 3.10+
* pip
* [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/)
* [Docker 24+](https://docs.docker.com/engine/install/#supported-platforms)
* [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation)
* [GeoDjango system dependencies](https://docs.djangoproject.com/en/3.1/ref/contrib/gis/install/geolibs/): `sudo apt install binutils libproj-dev gdal-bin`
## Dev Setup
## Setup for developers
```
You'll also need the [Arkindex frontend](https://gitlab.teklia.com/arkindex/frontend) to be able to develop on the whole platform.
```console
git clone git@gitlab.teklia.com:arkindex/backend.git
git clone git@gitlab.teklia.com:arkindex/frontend.git
cd backend
mkvirtualenv ark -a .
mkvirtualenv ark -a . -p /usr/bin/python3.10
pip install -e .[test]
```
When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services:
The Arkindex backend relies on some open-source services to store data and communicate with asynchronous workers.
To run all the required services, please run in a dedicated shell:
```console
make services
```
On a first run, you'll need to:
1. Configure the instance by enabling the sample configuration.
2. Populate the database structure.
3. Initialize some fields in the database.
4. Create an administration account.
All of these steps are done through:
```console
cp config.yml.sample arkindex/config.yml
arkindex migrate
arkindex bootstrap
arkindex createsuperuser
```
### Local configuration
Finally, you can run the backend:
```console
arkindex runserver
```
At this stage, you can use `http://localhost:8000/admin` to access the administration interface.
### Asynchronous tasks
To run asynchronous tasks, run in another shell:
```console
make worker
```
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://wiki.vpn/en/arkindex/deploy/configuration).
### Dockerized stack
It is also possible to run the whole Arkindex stack through Docker containers. This is useful to quickly test the platform.
Another mean to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
This command will build all the required Docker images (backend & frontend) and run them as Docker containers:
### ImageMagick setup
```console
make stack
```
You'll be able to access the platform at `https://ark.localhost`.
PDF and image imports in Arkindex require ImageMagick. Because ImageMagick can bring any computer down given the right parameters (for example, converting a 1000-page PDF file into JPEG files at 30,000 DPI), it ships with a security policy file. By default on Ubuntu, PDF conversion is forbidden.
### Local configuration
You will need to edit the ImageMagick policy file to get PDF and image imports to work in Arkindex. The file is located at `/etc/ImageMagick-6/policy.xml`.
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://redmine.teklia.com/projects/arkindex/wiki/Backend_configuration).
The line that sets the PDF policy is `<policy domain="coder" rights="none" pattern="PDF" />`. Replace `none` with `read|write` so that it reads `<policy domain="coder" rights="read|write" pattern="PDF" />`. See [this StackOverflow question](https://stackoverflow.com/questions/52998331) for more details.
Another way to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
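
As a rough illustration, a minimal `local_settings.py` might look like the sketch below; the settings shown are ordinary Django settings chosen purely as examples, not a recommended configuration:

```python
# arkindex/project/local_settings.py — illustration only.
# Any Django setting can be overridden here, and arbitrary Python runs at boot time.
from datetime import timedelta

DEBUG = True
INTERNAL_IPS = ["127.0.0.1"]  # e.g. needed by Django Debug Toolbar

# Values can be computed at startup, not just hard-coded.
SESSION_COOKIE_AGE = int(timedelta(days=7).total_seconds())
```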
### Local image server
......@@ -54,7 +97,7 @@ local_imageserver_id: 999
Here is how to quickly create the ImageServer using the shell:
```
```python
$ arkindex shell
>>> from arkindex.images.models import ImageServer
>>> ImageServer.objects.create(id=1, display_name='local', url='https://ark.localhost/iiif')
......@@ -62,11 +105,6 @@ $ arkindex shell
Note that this local server will only work inside Docker.
### User groups
We use a custom group model in `arkindex.users.models` (not the `django.contrib.auth` one).
In this early version groups do not define any right yet.
## Usage
### Makefile
......@@ -76,6 +114,7 @@ At the root of the repository is a Makefile that provides commands for common op
* `make` or `make all`: Clean and build;
* `make base`: Create and push the `arkindex-base` Docker image that is used to build the `arkindex-app` image;
* `make clean`: Clean up the Python package build and cache files;
* `make clean-docker`: Delete all running containers to avoid naming and network port conflicts;
* `make build`: Build the arkindex Python package and recreate the `arkindex-app:latest` without pushing to the GitLab container registry;
* `make test-fixtures`: Create the unit test fixtures on a temporary PostgreSQL database and save them to the `data.json` file used by most Django unit tests.
......@@ -83,14 +122,10 @@ At the root of the repository is a Makefile that provides commands for common op
Aside from the usual Django commands, some custom commands are available via `arkindex`:
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`);
* `from_csv`: Import manifests and index files from a CSV list;
* `import_annotations`: Import index files from a folder into a specific volume;
* `import_acts`: Import XML surface files and CSV act files;
* `delete_corpus`: Delete a big corpus using an RQ task;
* `reindex`: Reindex elements into Solr;
* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics.
* `move_lines_to_parents`: Moves element children to their geographical parents;
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`).
* `delete_corpus`: Delete a big corpus using an RQ task.
* `reindex`: Reindex elements into Solr.
* `move_lines_to_parents`: Moves element children to their geographical parents.
See `arkindex <command> --help` to view more details about a specific command.
......@@ -108,9 +143,9 @@ We use [pre-commit](https://pre-commit.com/) to check the Python source code syn
To be efficient, you should run pre-commit before committing (hence the name...).
To do that, run once :
To do that, run once:
```
```console
pip install pre-commit
pre-commit install
```
......@@ -127,9 +162,9 @@ IPython will give you a nicer shell with syntax highlighting, auto reloading and
[Django Debug Toolbar](https://django-debug-toolbar.readthedocs.io/en/latest/) provides you with a neat debug sidebar that will help diagnosing slow API endpoints or weird template bugs. Since the Arkindex frontend is completely decoupled from the backend, you will need to browse to an API endpoint to see the debug toolbar.
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports most of the backend's enums and some special QuerySet features:
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports some of the backend's enums and some special QuerySet features:
``` python
```python
SHELL_PLUS_POST_IMPORTS = [
('django.db.models', ('Value', )),
('django.db.models.functions', '*'),
......@@ -138,7 +173,7 @@ SHELL_PLUS_POST_IMPORTS = [
'Right',
)),
('arkindex.process.models', (
'DataImportMode',
'ProcessMode',
)),
('arkindex.project.aws', (
'S3FileStatus',
......@@ -148,23 +183,33 @@ SHELL_PLUS_POST_IMPORTS = [
## Asynchronous tasks
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist:
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`, or in other `tasks` modules within each Django app. The following tasks exist:
* Delete a corpus: `corpus_delete`
* Delete a list of elements: `element_trash`
* Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete`
* Move an element to another parent: `move_element`
* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity`
* Create `WorkerActivity` instances for all elements of a process: `initialize_activity`
* Delete a process and its worker activities: `process_delete`
* Export a corpus to an SQLite database: `export_corpus`
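
As a hedged sketch of how a trigger is meant to be called from Python code (the real signatures live in `arkindex/project/triggers.py` and may differ; the queryset filter below is only an example):

```python
# Sketch only: assumes a trigger named like the task exists in
# arkindex.project.triggers and accepts a queryset of elements; check the
# module for the actual signatures before relying on this.
from arkindex.documents.models import Element
from arkindex.project import triggers

pages = Element.objects.filter(corpus__name="Test corpus", type__slug="page")
triggers.element_trash(pages)  # enqueues the deletion task instead of blocking the request
```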
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build.
Process tasks are run in RQ by default (Community Edition). Two RQ workers must be running at the same time to actually run a process with worker activities, so the initialisation task can wait for the worker activity task to finish:
```sh
$ arkindex rqworker -v 3 default high & arkindex rqworker -v 3 tasks
```
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make services` will provide it. `make stack` also provides an RQ worker running in Docker from a binary build.
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
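
As a quick sanity check (assuming the backend is reachable on `localhost:8000` and no separate metrics port is configured), something like the following should print Prometheus-formatted metrics:

```python
# Fetch the Prometheus metrics endpoint; adjust the host and port to your setup.
import urllib.request

with urllib.request.urlopen("http://localhost:8000/metrics") as response:
    body = response.read().decode()

print("\n".join(body.splitlines()[:10]))  # show the first few exposition lines
```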
## Migration from `architecture` setup
If you were previously using the `architecture` repository to run Arkindex, you'll need to migrate MinIO data from a static path on your computer to a new Docker volume.
```console
docker volume create arkindex_miniodata
mv /usr/share/arkindex/s3/data/iiif /var/lib/docker/volumes/arkindex_miniodata/_data/uploads
mv /usr/share/arkindex/s3/data/{export,iiif-cache,ponos-logs,ponos-artifacts,staging,thumbnails,training} /var/lib/docker/volumes/arkindex_miniodata/_data/
```
You will also need to set up [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation), as we no longer use the Teklia development Certificate Authority. `mkcert` takes care of SSL certificates automatically, updating your browsers and system certificate store!
Finally, you can remove the `architecture` project from your work folder, as it's now archived and could be confusing.
1.5.4
1.6.0-beta3
......@@ -67,7 +67,7 @@ from arkindex.documents.serializers.elements import (
ElementNeighborsSerializer,
ElementParentSerializer,
ElementSerializer,
ElementSlimSerializer,
ElementTinySerializer,
ElementTypeSerializer,
MetaDataBulkSerializer,
MetaDataCreateSerializer,
......@@ -1410,7 +1410,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
@extend_schema(
operation_id="AddSelection",
description="Add specific elements",
responses={201: ElementSlimSerializer},
responses={201: ElementTinySerializer},
request=inline_serializer(
name="AddSelectionBodySerializer",
fields={"ids": serializers.ListField(child=serializers.UUIDField())}
......@@ -1450,7 +1450,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
prefetch_related_objects(elements, "corpus", "image__server", "type")
return Response(
status=status.HTTP_201_CREATED,
data=ElementSlimSerializer(
data=ElementTinySerializer(
elements,
context={"request": request},
many=True
......
......@@ -21,7 +21,7 @@ from arkindex.documents.models import (
Transcription,
TranscriptionEntity,
)
from arkindex.documents.serializers.elements import ElementSlimSerializer
from arkindex.documents.serializers.elements import ElementTinySerializer
from arkindex.documents.serializers.entities import (
BaseEntitySerializer,
CreateEntityRoleErrorResponseSerializer,
......@@ -218,7 +218,7 @@ class EntityElements(ListAPIView):
"""
Get all elements that have a link with the entity
"""
serializer_class = ElementSlimSerializer
serializer_class = ElementTinySerializer
# For OpenAPI type discovery: an entity's ID is in the path
queryset = Entity.objects.none()
......
......@@ -51,7 +51,13 @@ def run_pg_query(query, source_db):
Run a single Postgresql query and split the results into chunks.
When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name.
"""
with connections[source_db].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
db = connections[source_db]
# Make sure a connection is open and available for export databases
if source_db != "default" and db.connection is None:
db.connect()
with db.create_cursor(name=str(uuid.uuid4())) as pg_cursor:
pg_cursor.itersize = BATCH_SIZE
pg_cursor.execute(query)
......
......@@ -2,6 +2,8 @@ SELECT
dataset.id,
dataset.name,
dataset.state,
ARRAY_TO_STRING(dataset.sets, ',', '')
string_agg(datasetset.name, ',')
FROM training_dataset dataset
INNER JOIN training_datasetset datasetset ON datasetset.dataset_id = dataset.id
WHERE dataset.corpus_id = '{corpus_id}'::uuid
GROUP BY dataset.id
SELECT
dataset_element.id,
dataset_element.element_id,
dataset_element.dataset_id,
dataset_element.set
dataset_set.dataset_id,
dataset_set.name
FROM training_datasetelement dataset_element
INNER JOIN training_dataset dataset ON (dataset_element.dataset_id = dataset.id)
INNER JOIN training_datasetset dataset_set ON (dataset_element.set_id = dataset_set.id)
INNER JOIN training_dataset dataset ON (dataset_set.dataset_id = dataset.id)
WHERE dataset.corpus_id = '{corpus_id}'::uuid
......@@ -14,9 +14,14 @@ from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType
from arkindex.users.models import User
# Constants used in architecture project
IMAGE_SERVER_ID = 12345
IMAGE_SERVER_BUCKET = "iiif"
IMAGE_SERVER_REGION = "local"
UPLOADS_IMAGE_SERVER_ID = 12345
UPLOADS_IMAGE_SERVER_URL = "https://uploads.iiif.ark.localhost/iiif/2"
UPLOADS_IMAGE_SERVER_BUCKET = "uploads"
UPLOADS_IMAGE_SERVER_REGION = "local"
INGEST_IMAGE_SERVER_ID = 67890
INGEST_IMAGE_SERVER_URL = "https://ingest.iiif.ark.localhost/iiif/2"
INGEST_IMAGE_SERVER_BUCKET = "ingest"
INGEST_IMAGE_SERVER_REGION = "local"
PONOS_FARM_ID = "001e411a-1111-2222-3333-444455556666"
PONOS_FARM_NAME = "Bootstrap farm"
PONOS_FARM_SEED = "b12868101dab84984481741663d809d2393784894d6e807ceee0bd95051bf971"
......@@ -52,6 +57,46 @@ class Command(BaseCommand):
user.save()
self.warn(f"Updated user {user} to admin")
def create_image_server(self, id, url, bucket, region, display_name):
try:
server = ImageServer.objects.get(Q(id=id) | Q(url=url))
if server.id != id:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=id)
ImageServer.objects.filter(id=server.id).update(id=id)
self.warn(f"Image server {server.id} updated to {id}")
# Update internal reference for updates below
server.id = id
if server.url != url:
server.url = url
server.save()
# Update base settings
if server.s3_bucket != bucket or server.s3_region != region:
server.s3_bucket = bucket
server.s3_region = region
server.save()
self.warn(f"Updated image server {server.id} S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=id,
url=url,
s3_bucket=bucket,
s3_region=region,
display_name=display_name,
)
self.success(f"Image server {server.id} created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
return server
def handle(self, **options):
# Never allow running this script in production
if not settings.DEBUG:
......@@ -108,47 +153,18 @@ class Command(BaseCommand):
self.success(f"Created token {ADMIN_API_TOKEN}")
# an image server for local cantaloupe https://ark.localhost/iiif/2
try:
server = ImageServer.objects.get(url="https://ark.localhost/iiif/2")
if server.id != IMAGE_SERVER_ID:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=IMAGE_SERVER_ID)
ImageServer.objects.filter(id=server.id).update(id=IMAGE_SERVER_ID)
self.warn(f"Image server {server.id} updated to {IMAGE_SERVER_ID}")
# Update internal reference for updates below
server.id = IMAGE_SERVER_ID
# Update base settings
if server.s3_bucket != IMAGE_SERVER_BUCKET or server.s3_region != IMAGE_SERVER_REGION:
server.s3_bucket = IMAGE_SERVER_BUCKET
server.s3_region = IMAGE_SERVER_REGION
server.save()
self.warn("Updated image server S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=IMAGE_SERVER_ID,
url="https://ark.localhost/iiif/2",
s3_bucket=IMAGE_SERVER_BUCKET,
s3_region=IMAGE_SERVER_REGION,
display_name="Development local server",
)
self.success("Image server created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID, UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET, UPLOADS_IMAGE_SERVER_REGION, "Local IIIF server for user uploaded files through frontend")
if uploads_server is None:
return
self.create_image_server(INGEST_IMAGE_SERVER_ID, INGEST_IMAGE_SERVER_URL, INGEST_IMAGE_SERVER_BUCKET, INGEST_IMAGE_SERVER_REGION, "Local IIIF server for ingested files from minio")
# Check there is not already a local server with invalid path
# We'll merge its image into the new one
# This bad server may have been created by automatic IIIF server detection
try:
bad_server = ImageServer.objects.get(url="https://ark.localhost/iiif")
bad_server.merge_into(server)
self.warn(f"Merged images from {bad_server.id} into {server.id}")
bad_server = ImageServer.objects.get(url="https://uploads.iiif.ark.localhost/iiif")
bad_server.merge_into(uploads_server)
self.warn(f"Merged images from {bad_server.id} into {uploads_server.id}")
bad_server.delete()
self.warn("Deleted old server")
......@@ -194,17 +210,21 @@ class Command(BaseCommand):
)
self.success(f"Created revision {revision.hash}")
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
try:
version = WorkerVersion.objects.get(id=IMPORT_WORKER_VERSION_ID)
created = False
except WorkerVersion.DoesNotExist:
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
if created:
self.success(f"Created worker version {version.slug}")
else:
......
#!/usr/bin/env python3
from datetime import datetime, timezone
from unittest.mock import patch
from django.contrib.gis.geos import LinearRing
......@@ -8,7 +7,7 @@ from django.utils import timezone as DjangoTimeZone
from arkindex.documents.models import Corpus, Element, MetaData, MetaType
from arkindex.images.models import Image, ImageServer
from arkindex.ponos.models import Farm, State
from arkindex.ponos.models import Farm
from arkindex.process.models import (
FeatureUsage,
Process,
......@@ -21,6 +20,7 @@ from arkindex.process.models import (
WorkerVersionState,
)
from arkindex.project.tools import fake_now
from arkindex.training.models import DatasetSet
from arkindex.users.models import Group, Right, Role, User
......@@ -104,23 +104,7 @@ class Command(BaseCommand):
farm = Farm.objects.create(name="Wheat farm")
farm.memberships.create(user=user, level=Role.Guest.value)
# Create a fake docker build with a docker image task
build_process = Process.objects.create(
farm=farm,
creator=superuser,
mode=ProcessMode.Repository,
)
build_task = build_process.tasks.create(
run=0,
depth=0,
slug="docker_build",
state=State.Completed,
# Use an expiry very far away so that task is never expired
expiry=datetime(2100, 12, 31, 23, 59, 59, 999999, timezone.utc),
)
docker_image = build_task.artifacts.create(size=42_000, path="/path/to/docker_build")
# Create some workers for the repository with their available version
# Create some workers with available versions
recognizer_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
name="Recognizer",
......@@ -131,7 +115,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
dla_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
......@@ -143,7 +127,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -156,7 +140,7 @@ class Command(BaseCommand):
configuration={},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -169,7 +153,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest",
gpu_usage=FeatureUsage.Required
)
......@@ -185,7 +169,7 @@ class Command(BaseCommand):
state=WorkerVersionState.Available,
gpu_usage=FeatureUsage.Disabled,
model_usage=FeatureUsage.Required,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
# Create a custom worker version that is not linked to a Git repository/revision
......@@ -288,8 +272,15 @@ class Command(BaseCommand):
)
# Create 2 datasets
corpus.datasets.create(name="First Dataset", description="dataset number one", creator=user)
corpus.datasets.create(name="Second Dataset", description="dataset number two", creator=user)
dataset_1 = corpus.datasets.create(name="First Dataset", description="dataset number one", creator=user)
dataset_2 = corpus.datasets.create(name="Second Dataset", description="dataset number two", creator=user)
# Create their sets
DatasetSet.objects.bulk_create(
DatasetSet(name=name, dataset_id=dataset_1.id) for name in ["training", "validation", "test"]
)
DatasetSet.objects.bulk_create(
DatasetSet(name=name, dataset_id=dataset_2.id) for name in ["training", "validation", "test"]
)
# Create 2 volumes
vol1 = Element.objects.create(
......
......@@ -37,7 +37,7 @@ from arkindex.process.models import (
WorkerType,
WorkerVersion,
)
from arkindex.training.models import Dataset, DatasetElement, Model
from arkindex.training.models import Dataset, DatasetElement, DatasetSet, Model
from arkindex.users.models import Role, User
EXPORT_VERSION = 8
......@@ -320,17 +320,30 @@ class Command(BaseCommand):
id=row["id"],
corpus=self.corpus,
name=row["name"],
sets=[r.strip() for r in row["sets"].split(",")],
creator=self.user,
description="Imported dataset",
)]
def convert_dataset_sets(self, row):
return [
DatasetSet(
name=set_name.strip(),
dataset_id=row["id"]
)
for set_name in row["sets"].split(",")
]
def map_dataset_sets(self):
return {
(str(set.dataset_id), set.name): set.id
for set in DatasetSet.objects.filter(dataset__corpus=self.corpus)
}
def convert_dataset_elements(self, row):
return [DatasetElement(
id=row["id"],
element_id=row["element_id"],
dataset_id=row["dataset_id"],
set=row["set_name"],
set_id=self.dataset_sets_map[(row["dataset_id"], row["set_name"])]
)]
def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True):
......@@ -603,6 +616,12 @@ class Command(BaseCommand):
# Create datasets
self.bulk_create_objects(Dataset, self.convert_datasets, SQL_DATASET_QUERY)
# Create dataset sets
self.bulk_create_objects(DatasetSet, self.convert_dataset_sets, SQL_DATASET_QUERY)
# Create dataset sets mapping
self.dataset_sets_map = self.map_dataset_sets()
# Create dataset elements
self.bulk_create_objects(DatasetElement, self.convert_dataset_elements, SQL_ELEMENT_DATASET_QUERY)
......
import math
import uuid
from collections import defaultdict
from functools import cached_property
from textwrap import dedent
from django.conf import settings
......@@ -23,7 +24,6 @@ from arkindex.documents.serializers.light import (
from arkindex.documents.serializers.ml import ClassificationSerializer, WorkerRunSummarySerializer
from arkindex.images.models import Image
from arkindex.images.serializers import ZoneSerializer
from arkindex.ponos.utils import get_process_from_task_auth
from arkindex.process.models import WorkerVersion
from arkindex.project.fields import Array
from arkindex.project.mixins import SelectionMixin
......@@ -429,29 +429,6 @@ class ElementTinySerializer(serializers.ModelSerializer):
)
class ElementSlimSerializer(ElementTinySerializer):
"""
Fully serialises a document
"""
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
@extend_schema_field(serializers.CharField(allow_null=True))
def get_thumbnail_put_url(self, element):
"""
Only set the Thumbnail PUT URL for Ponos tasks that
are running the thumbnails generation on a folder.
"""
if element.type.folder:
process = get_process_from_task_auth(self.context["request"])
if process and process.generate_thumbnails:
return element.thumbnail.s3_put_url
class Meta(ElementTinySerializer.Meta):
model = Element
fields = ElementTinySerializer.Meta.fields + ("thumbnail_put_url",)
read_only_fields = ElementTinySerializer.Meta.read_only_fields + ("thumbnail_put_url",)
@extend_schema_serializer(deprecate_fields=("worker_version_id", ))
class ElementListSerializer(ElementTinySerializer):
created = serializers.DateTimeField(read_only=True)
......@@ -555,7 +532,7 @@ class ElementParentSerializer(serializers.Serializer):
@extend_schema_serializer(deprecate_fields=("worker_version", ))
class ElementSerializer(ElementSlimSerializer):
class ElementSerializer(ElementTinySerializer):
"""
Serialize an element with its metadata and classifications
"""
......@@ -591,9 +568,11 @@ class ElementSerializer(ElementSlimSerializer):
worker_run = WorkerRunSummarySerializer(read_only=True, allow_null=True)
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
class Meta:
model = Element
fields = ElementSlimSerializer.Meta.fields + (
fields = ElementTinySerializer.Meta.fields + (
"created",
"creator",
"rights",
......@@ -603,32 +582,57 @@ class ElementSerializer(ElementSlimSerializer):
"polygon",
"worker_version",
"confidence",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
read_only_fields = ElementSlimSerializer.Meta.read_only_fields + (
read_only_fields = ElementTinySerializer.Meta.read_only_fields + (
"created",
"creator",
"rights",
"metadata_count",
"classifications",
"worker_version",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
@cached_property
def element_rights(self):
if not self.instance:
return
user = self.context["request"].user
level = get_max_level(user, element.corpus)
level = get_max_level(user, self.instance.corpus)
# Admin access is granted to both corpus admins and element creators that are corpus contributors
if level >= Role.Contributor.value and self.instance.creator_id == user.id:
return Role.Admin.value
return level
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
rights = ["read"]
if level >= Role.Contributor.value:
if self.element_rights >= Role.Contributor.value:
rights.append("write")
# Admin access is granted to both corpus admins and element creators
if level >= Role.Admin.value or (level >= Role.Contributor.value and element.creator_id == user.id):
if self.element_rights >= Role.Admin.value:
rights.append("admin")
return rights
@extend_schema_field(serializers.CharField(
allow_null=True,
help_text=dedent("""
URL where a PUT request may be sent to upload a new thumbnail for this element.
Only available on folder elements.
Requires **admin** access to the corpus, or **contributor** access to the corpus and to be the element's creator.
"""),
))
def get_thumbnail_put_url(self, element):
if element.type.folder and self.element_rights >= Role.Admin.value:
return element.thumbnail.s3_put_url
def update(self, instance, validated_data):
image = validated_data.pop("image", None)
polygon = validated_data.pop("polygon", None)
......
......@@ -23,8 +23,8 @@ from arkindex.documents.models import (
TranscriptionEntity,
)
from arkindex.ponos.models import Task
from arkindex.process.models import Process, ProcessDataset, ProcessElement, WorkerActivity, WorkerRun
from arkindex.training.models import DatasetElement
from arkindex.process.models import Process, ProcessDatasetSet, ProcessElement, WorkerActivity, WorkerRun
from arkindex.training.models import DatasetElement, DatasetSet
from arkindex.users.models import User
logger = logging.getLogger(__name__)
......@@ -70,10 +70,11 @@ def corpus_delete(corpus_id: str) -> None:
Selection.objects.filter(element__corpus_id=corpus_id),
corpus.memberships.all(),
corpus.exports.all(),
# ProcessDataset M2M
ProcessDataset.objects.filter(dataset__corpus_id=corpus_id),
ProcessDataset.objects.filter(process__corpus_id=corpus_id),
DatasetElement.objects.filter(dataset__corpus_id=corpus_id),
# ProcessDatasetSet M2M
ProcessDatasetSet.objects.filter(set__dataset__corpus_id=corpus_id),
ProcessDatasetSet.objects.filter(process__corpus_id=corpus_id),
DatasetElement.objects.filter(set__dataset__corpus_id=corpus_id),
DatasetSet.objects.filter(dataset__corpus_id=corpus_id),
corpus.datasets.all(),
# Delete the hidden M2M task parents table
Task.parents.through.objects.filter(from_task__process__corpus_id=corpus_id),
......
......@@ -14,6 +14,7 @@ from arkindex.documents.tasks import corpus_delete
from arkindex.images.models import Image, ImageServer
from arkindex.process.models import ProcessMode, Repository, Worker, WorkerRun, WorkerType, WorkerVersion
from arkindex.project.tests import FixtureTestCase
from arkindex.training.models import Dataset, DatasetElement
BASE_DIR = Path(__file__).absolute().parent
......@@ -132,6 +133,9 @@ class TestLoadExport(FixtureTestCase):
dla_version = WorkerVersion.objects.get(worker__slug="dla")
dla_run = dla_version.worker_runs.get(process__mode=ProcessMode.Workers)
dataset_set = Dataset.objects.first().sets.first()
DatasetElement.objects.create(set=dataset_set, element=element)
element.classifications.create(
ml_class=self.corpus.ml_classes.create(name="Blah"),
confidence=.55555555,
......@@ -266,6 +270,9 @@ class TestLoadExport(FixtureTestCase):
confidence=.55555555,
)
dataset_set = Dataset.objects.first().sets.first()
DatasetElement.objects.create(set=dataset_set, element=element)
person_type = EntityType.objects.get(
name="person",
corpus=self.corpus
......
......@@ -3,9 +3,16 @@ from django.db.models.signals import pre_delete
from arkindex.documents.models import Corpus, Element, EntityType, MetaType, Transcription
from arkindex.documents.tasks import corpus_delete
from arkindex.ponos.models import Farm, State, Task
from arkindex.process.models import CorpusWorkerVersion, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.process.models import (
CorpusWorkerVersion,
Process,
ProcessDatasetSet,
ProcessMode,
Repository,
WorkerVersion,
)
from arkindex.project.tests import FixtureTestCase, force_constraints_immediate
from arkindex.training.models import Dataset
from arkindex.training.models import Dataset, DatasetSet
class TestDeleteCorpus(FixtureTestCase):
......@@ -114,25 +121,32 @@ class TestDeleteCorpus(FixtureTestCase):
cls.corpus2 = Corpus.objects.create(name="Other corpus")
dataset1 = Dataset.objects.get(name="First Dataset")
dataset1.dataset_elements.create(element=element, set="test")
test_set_1 = dataset1.sets.get(name="test")
test_set_1.set_elements.create(element=element)
cls.dataset2 = Dataset.objects.create(name="Dead Sea Scrolls", description="How to trigger a Third Impact", creator=cls.user, corpus=cls.corpus2)
# Process on cls.corpus and with a dataset from cls.corpus
DatasetSet.objects.bulk_create(
DatasetSet(
dataset=cls.dataset2,
name=set_name
) for set_name in ["test", "training", "validation"]
)
# Process on cls.corpus and with a set from cls.corpus
dataset_process1 = cls.corpus.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=dataset_process1, dataset=dataset1, sets=dataset1.sets)
# Process on cls.corpus with a dataset from another corpus
ProcessDatasetSet.objects.create(process=dataset_process1, set=test_set_1)
# Process on cls.corpus with a set from another corpus
dataset_process2 = cls.corpus.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=dataset_process2, dataset=dataset1, sets=dataset1.sets)
ProcessDataset.objects.create(process=dataset_process2, dataset=cls.dataset2, sets=cls.dataset2.sets)
# Process on another corpus with a dataset from another corpus and none from cls.corpus
ProcessDatasetSet.objects.create(process=dataset_process2, set=test_set_1)
ProcessDatasetSet.objects.create(process=dataset_process2, set=cls.dataset2.sets.get(name="training"))
# Process on another corpus with a set from another corpus and none from cls.corpus
cls.dataset_process3 = cls.corpus2.processes.create(creator=cls.user, mode=ProcessMode.Dataset)
ProcessDataset.objects.create(process=cls.dataset_process3, dataset=cls.dataset2, sets=cls.dataset2.sets)
ProcessDatasetSet.objects.create(process=cls.dataset_process3, set=cls.dataset2.sets.get(name="validation"))
cls.rev = cls.repo.revisions.create(
hash="42",
message="oh",
author="me",
)
cls.process = cls.rev.processes.create(
cls.process = Process.objects.create(
creator=cls.user,
corpus=cls.corpus2,
mode=ProcessMode.Files,
......@@ -204,7 +218,6 @@ class TestDeleteCorpus(FixtureTestCase):
self.dataset_process3.refresh_from_db()
self.assertTrue(self.repo.revisions.filter(id=self.rev.id).exists())
self.assertEqual(self.process.revision, self.rev)
self.assertEqual(self.process.files.get(), self.df)
self.assertTrue(Element.objects.get_descending(self.vol.id).filter(id=self.page.id).exists())
self.assertTrue(self.corpus2.datasets.filter(id=self.dataset2.id).exists())
......