Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend
Commits on source: 14
Showing 902 additions and 838 deletions
......@@ -2,6 +2,9 @@
.git
.eggs
*.egg
logs
**/__pycache__/
**/*.pyc
docker/
Makefile
test-report.xml
arkindex/config.yml
......@@ -16,3 +16,4 @@ htmlcov
*.key
arkindex/config.yml
test-report.xml
docker/ssl/*.pem
......@@ -58,7 +58,7 @@ backend-tests:
- test-report.xml
script:
- arkindex/manage.py test
- arkindex test
backend-lint:
image: python:3.10
......@@ -91,7 +91,7 @@ backend-migrations:
alias: postgres
script:
- arkindex/manage.py makemigrations --check --noinput --dry-run -v 3
- arkindex makemigrations --check --noinput --dry-run -v 3
backend-openapi:
extends: .backend-setup
......
......@@ -19,10 +19,13 @@ RUN chown -R ark:teklia /backend_static
# Copy Version file
COPY VERSION /etc/arkindex.version
HEALTHCHECK --start-period=1m --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
ENV PORT 8000
HEALTHCHECK --start-period=10s --interval=30s --timeout=5s \
CMD wget --spider --quiet http://localhost:$PORT/api/v1/corpus/ || exit 1
# Allow usage of django-admin by exposing our settings
ENV DJANGO_SETTINGS_MODULE "arkindex.project.settings"
# Run with Gunicorn
ENV PORT 8000
EXPOSE $PORT
CMD manage.py gunicorn --host=0.0.0.0 --port $PORT
CMD arkindex gunicorn --host=0.0.0.0 --port $PORT
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend
.PHONY: all release
.PHONY: all release services
all: clean build
......@@ -16,33 +16,33 @@ build:
CI_PROJECT_DIR=$(ROOT_DIR) CI_REGISTRY_IMAGE=$(IMAGE_TAG) $(ROOT_DIR)/ci/build.sh
worker:
arkindex/manage.py rqworker -v 2 default high tasks
arkindex rqworker -v 2 default high tasks
test-fixtures:
$(eval export PGPASSWORD=devdata)
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'CREATE DATABASE arkindex_dev' template1
# A "try...finally" block in a Makefile: ensure we bring back the dev database even when test-fixtures fails
-$(MAKE) test-fixtures-run
$(MAKE) test-fixtures-restore
test-fixtures-run:
arkindex/manage.py migrate
arkindex/manage.py build_fixtures
arkindex/manage.py dumpdata --indent 4 process documents images users auth ponos training > arkindex/documents/fixtures/data.json
arkindex migrate
arkindex build_fixtures
arkindex dumpdata --indent 4 process documents images users auth ponos training > arkindex/documents/fixtures/data.json
test-fixtures-restore:
# This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'DROP DATABASE arkindex_dev' template1
psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1
require-version:
@if [ ! "$(version)" ]; then echo "Missing version to publish"; exit 1; fi
@git rev-parse $(version) >/dev/null 2>&1 && (echo "Version $(version) already exists on local git repo !" && exit 1) || true
schema:
./arkindex/manage.py spectacular --fail-on-warn --validate --file schema.yml
arkindex spectacular --fail-on-warn --validate --file schema.yml
release:
$(eval version:=$(shell cat VERSION))
......@@ -50,3 +50,21 @@ release:
git commit VERSION -m "Version $(version)"
git tag $(version)
git push origin master $(version)
clean-docker:
$(eval containers:=$(shell docker ps -a -q))
@if [ -n "$(containers)" ]; then \
echo "Cleaning up past containers\n" \
docker rm -f $(containers) ; \
fi
stack: docker/ssl/ark-cert.pem
docker compose -p arkindex up --build
services: docker/ssl/ark-cert.pem
docker compose -p arkindex -f docker/docker-compose.services.yml up
docker/ssl/ark-cert.pem:
$(eval export CAROOT=$(ROOT_DIR)/docker/ssl)
mkcert -install
mkcert -cert-file=$(ROOT_DIR)/docker/ssl/ark-cert.pem -key-file=$(ROOT_DIR)/docker/ssl/ark-key.pem ark.localhost *.ark.localhost *.iiif.ark.localhost
......@@ -6,43 +6,83 @@ Backend for Historical Manuscripts Indexing
## Requirements
* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture)
* Git
* Make
* Python 3.6+
* Python 3.10+
* pip
* [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/)
* [Docker 24+](https://docs.docker.com/engine/install/#supported-platforms)
* [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation)
* [GeoDjango system dependencies](https://docs.djangoproject.com/en/3.1/ref/contrib/gis/install/geolibs/): `sudo apt install binutils libproj-dev gdal-bin`
## Dev Setup
## Setup for developers
```
You'll also need the [Arkindex frontend](https://gitlab.teklia.com/arkindex/frontend) to be able to develop on the whole platform.
```console
git clone git@gitlab.teklia.com:arkindex/backend.git
git clone git@gitlab.teklia.com:arkindex/frontend.git
cd backend
mkvirtualenv ark -a .
mkvirtualenv ark -a . -p /usr/bin/python3.10
pip install -e .[test]
```
When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services:
The Arkindex backend relies on some open-source services to store data and communicate with asynchronous workers.
To run all the required services, please run in a dedicated shell:
```console
make services
```
arkindex/manage.py migrate
arkindex/manage.py createsuperuser
On a first run, you'll need to:
1. Configure the instance by enabling the sample configuration.
2. Populate the database structure.
3. Initialize some fields in the database.
4. Create an administration account.
All of these steps are done through:
```console
cp config.yml.sample arkindex/config.yml
arkindex migrate
arkindex bootstrap
arkindex createsuperuser
```
### Local configuration
Finally, you can run the backend:
```console
arkindex runserver
```
At this stage, you can use `http://localhost:8000/admin` to access the administration interface.
### Asynchronous tasks
To run asynchronous tasks, run in another shell:
```console
make worker
```
### Dockerized stack
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://wiki.vpn/en/arkindex/deploy/configuration).
It is also possible to run the whole Arkindex stack through Docker containers. This is useful to quickly test the platform.
This command will build all the required Docker images (backend & frontend) and run them as Docker containers:
Another mean to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
```console
make stack
```
### ImageMagick setup
You'll be able to access the platform at the URL `https://ark.localhost`.
PDF and image imports in Arkindex will require ImageMagick. Due to its ability to take any computer down if you give it the right parameters (for example, converting a 1000-page PDF file into JPEG files at 30 000 DPI), it has a security policy file. By default, on Ubuntu, PDF conversion is forbidden.
### Local configuration
You will need to edit the ImageMagick policy file to get PDF and Image imports to work in Arkindex. The file is located at `/etc/ImageMagick-6/policy.xml`.
For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://redmine.teklia.com/projects/arkindex/wiki/Backend_configuration).
The line that sets the PDF policy is `<policy domain="coder" rights="none" pattern="PDF" />`. Replace `none` with `read|write` for it to work. See [this StackOverflow question](https://stackoverflow.com/questions/52998331) for more info.
Another way to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances.
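
As a minimal sketch, a `local_settings.py` could look like the following; the settings shown are plain Django settings chosen for illustration, not a documented list of Arkindex options:

```python
# arkindex/project/local_settings.py (not tracked by Git; only loaded if it exists)
# Illustrative overrides only: any Django setting can be customized here.
DEBUG = True

# Example of a dev-only tweak, e.g. to let django-debug-toolbar show up locally
INTERNAL_IPS = ["127.0.0.1"]
```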
### Local image server
......@@ -54,19 +94,14 @@ local_imageserver_id: 999
Here is how to quickly create the ImageServer using the shell:
```
backend/arkindex$ ./manage.py shell
```python
$ arkindex shell
>>> from arkindex.images.models import ImageServer
>>> ImageServer.objects.create(id=1, display_name='local', url='https://ark.localhost/iiif')
```
Note that this local server will only work inside Docker.
### User groups
We use a custom group model in `arkindex.users.models` (not the `django.contrib.auth` one).
In this early version groups do not define any right yet.
## Usage
### Makefile
......@@ -76,31 +111,28 @@ At the root of the repository is a Makefile that provides commands for common op
* `make` or `make all`: Clean and build;
* `make base`: Create and push the `arkindex-base` Docker image that is used to build the `arkindex-app` image;
* `make clean`: Cleanup the Python package build and cache files;
* `make clean-docker`: Deletes all running containers to avoid naming and network ports conflicts;
* `make build`: Build the arkindex Python package and recreate the `arkindex-app:latest` without pushing to the GitLab container registry;
* `make test-fixtures`: Create the unit tests fixtures on a temporary PostgreSQL database and save them to the `data.json` file used by most Django unit tests.
### Django commands
Aside from the usual Django commands, some custom commands are available via `manage.py`:
Aside from the usual Django commands, some custom commands are available via `arkindex`:
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`);
* `from_csv`: Import manifests and index files from a CSV list;
* `import_annotations`: Import index files from a folder into a specific volume;
* `import_acts`: Import XML surface files and CSV act files;
* `delete_corpus`: Delete a big corpus using an RQ task;
* `reindex`: Reindex elements into Solr;
* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics.
* `move_lines_to_parents`: Moves element children to their geographical parents;
* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`).
* `delete_corpus`: Delete a big corpus using an RQ task.
* `reindex`: Reindex elements into Solr.
* `move_lines_to_parents`: Moves element children to their geographical parents.
See `manage.py <command> --help` to view more details about a specific command.
See `arkindex <command> --help` to view more details about a specific command.
## Code validation
Once your code appears to be working on a local server, a few checks have to be performed:
* **Migrations:** Ensure that all migrations have been created by typing `./manage.py makemigrations`.
* **Unit tests:** Run `./manage.py test` to perform unit tests.
- Use `./manage.py test module_name` to perform tests on a single module, if you wish to spend less time waiting for all tests to complete.
* **Migrations:** Ensure that all migrations have been created by typing `arkindex makemigrations`.
* **Unit tests:** Run `arkindex test` to perform unit tests.
- Use `arkindex test module_name` to perform tests on a single module, if you wish to spend less time waiting for all tests to complete.
### Linting
......@@ -108,9 +140,9 @@ We use [pre-commit](https://pre-commit.com/) to check the Python source code syn
To be efficient, you should run pre-commit before committing (hence the name...).
To do that, run once :
To do that, run once:
```
```console
pip install pre-commit
pre-commit install
```
......@@ -123,13 +155,13 @@ If you want to run the full workflow on all the files: `pre-commit run -a`.
Run `pip install ipython django-debug-toolbar django_extensions` to install all the available optional dev tools for the backend.
IPython will give you a nicer shell with syntax highlighting, auto reloading and much more via `./manage.py shell`.
IPython will give you a nicer shell with syntax highlighting, auto reloading and much more via `arkindex shell`.
[Django Debug Toolbar](https://django-debug-toolbar.readthedocs.io/en/latest/) provides you with a neat debug sidebar that will help diagnosing slow API endpoints or weird template bugs. Since the Arkindex frontend is completely decoupled from the backend, you will need to browse to an API endpoint to see the debug toolbar.
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `manage.py` commands ; the most important one is `./manage.py shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports most of the backend's enums and some special QuerySet features:
[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands; the most important one is `arkindex shell_plus`, which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports some of the backend's enums and some special QuerySet features:
``` python
```python
SHELL_PLUS_POST_IMPORTS = [
('django.db.models', ('Value', )),
('django.db.models.functions', '*'),
......@@ -138,7 +170,7 @@ SHELL_PLUS_POST_IMPORTS = [
'Right',
)),
('arkindex.process.models', (
'DataImportMode',
'ProcessMode',
)),
('arkindex.project.aws', (
'S3FileStatus',
......@@ -148,23 +180,33 @@ SHELL_PLUS_POST_IMPORTS = [
## Asynchronous tasks
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist:
We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`, or in other `tasks` modules within each Django app. The following tasks exist:
* Delete a corpus: `corpus_delete`
* Delete a list of elements: `element_trash`
* Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete`
* Move an element to another parent: `move_element`
* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity`
* Create `WorkerActivity` instances for all elements of a process: `initialize_activity`
* Delete a process and its worker activities: `process_delete`
* Export a corpus to an SQLite database: `export_corpus`
To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build.
Process tasks are run in RQ by default (Community Edition). Two RQ workers must be running at the same time to actually run a process with worker activities, so the initialisation task can wait for the worker activity task to finish:
```sh
$ manage.py rqworker -v 3 default high & manage.py rqworker -v 3 tasks
```
To run them, use `make worker` to start an RQ worker. You will need to have Redis running; `make services` will provide it. `make stack` also provides an RQ worker running in Docker from a binary build.
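
As a hedged sketch of enqueuing one of these tasks from Python code through a trigger (the function name `corpus_delete` and its signature are assumptions mirroring the task name above; check `arkindex.project.triggers` for the real API):

```python
# Hypothetical usage, e.g. from `arkindex shell`; the trigger name and signature
# are assumptions, the actual trigger methods live in arkindex.project.triggers.
from arkindex.documents.models import Corpus
from arkindex.project import triggers

corpus = Corpus.objects.first()
# Enqueues an RQ task instead of deleting synchronously during a request
triggers.corpus_delete(corpus)
```

A worker started with `make worker` would then pick the job up from one of its queues (`default`, `high`, `tasks`).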
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
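
As a quick, hedged sanity check (the port `3000` is only an example value for `PROMETHEUS_METRICS_PORT`, and the `requests` package is assumed to be installed):

```python
# Fetch the Prometheus exposition output from the dedicated metrics port.
import requests

response = requests.get("http://localhost:3000/metrics", timeout=5)
response.raise_for_status()
print("\n".join(response.text.splitlines()[:5]))  # show the first few metric lines
```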
## Migration from `architecture` setup
If you previously used the `architecture` repository to run Arkindex, you'll need to migrate MinIO data from a static path on your computer to a new Docker volume.
```console
docker volume create arkindex_miniodata
mv /usr/share/arkindex/s3/data/iiif /var/lib/docker/volumes/arkindex_miniodata/_data/uploads
mv /usr/share/arkindex/s3/data/{export,iiif-cache,ponos-logs,ponos-artifacts,staging,thumbnails,training} /var/lib/docker/volumes/arkindex_miniodata/_data/
```
You will also need to set up [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation) as we do not use the Teklia development Certificate Authority anymore. `mkcert` will take care of SSL certificates automatically, updating your browsers and system certificate store!
Finally, you can remove the `architecture` project from your work folder, as it's now archived and could be confusing.
1.5.4
1.6.0-beta2
......@@ -67,7 +67,7 @@ from arkindex.documents.serializers.elements import (
ElementNeighborsSerializer,
ElementParentSerializer,
ElementSerializer,
ElementSlimSerializer,
ElementTinySerializer,
ElementTypeSerializer,
MetaDataBulkSerializer,
MetaDataCreateSerializer,
......@@ -1410,7 +1410,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
@extend_schema(
operation_id="AddSelection",
description="Add specific elements",
responses={201: ElementSlimSerializer},
responses={201: ElementTinySerializer},
request=inline_serializer(
name="AddSelectionBodySerializer",
fields={"ids": serializers.ListField(child=serializers.UUIDField())}
......@@ -1450,7 +1450,7 @@ class ManageSelection(SelectionMixin, ListAPIView):
prefetch_related_objects(elements, "corpus", "image__server", "type")
return Response(
status=status.HTTP_201_CREATED,
data=ElementSlimSerializer(
data=ElementTinySerializer(
elements,
context={"request": request},
many=True
......
......@@ -21,7 +21,7 @@ from arkindex.documents.models import (
Transcription,
TranscriptionEntity,
)
from arkindex.documents.serializers.elements import ElementSlimSerializer
from arkindex.documents.serializers.elements import ElementTinySerializer
from arkindex.documents.serializers.entities import (
BaseEntitySerializer,
CreateEntityRoleErrorResponseSerializer,
......@@ -218,7 +218,7 @@ class EntityElements(ListAPIView):
"""
Get all elements that have a link with the entity
"""
serializer_class = ElementSlimSerializer
serializer_class = ElementTinySerializer
# For OpenAPI type discovery: an entity's ID is in the path
queryset = Entity.objects.none()
......
......@@ -2,7 +2,9 @@ from datetime import timedelta
from textwrap import dedent
from django.conf import settings
from django.shortcuts import get_object_or_404
from django.utils import timezone
from django.utils.functional import cached_property
from drf_spectacular.utils import extend_schema, extend_schema_view
from rest_framework import permissions, serializers, status
from rest_framework.exceptions import PermissionDenied, ValidationError
......@@ -11,9 +13,7 @@ from rest_framework.response import Response
from arkindex.documents.models import Corpus, CorpusExport, CorpusExportState
from arkindex.documents.serializers.export import CorpusExportSerializer
from arkindex.project.mixins import CorpusACLMixin
from arkindex.project.permissions import IsVerified
from arkindex.users.models import Role
@extend_schema(tags=["exports"])
......@@ -27,47 +27,42 @@ from arkindex.users.models import Role
),
post=extend_schema(
operation_id="StartExport",
request=None,
description=dedent(
f"""
Start a corpus export job.
A user must wait for {settings.EXPORT_TTL_SECONDS} seconds after the last successful import
before being able to generate a new export of the same corpus.
before being able to generate a new export of the same corpus from the same source.
Contributor access is required.
"""
),
)
)
class CorpusExportAPIView(CorpusACLMixin, ListCreateAPIView):
class CorpusExportAPIView(ListCreateAPIView):
permission_classes = (IsVerified, )
serializer_class = CorpusExportSerializer
queryset = CorpusExport.objects.none()
@cached_property
def corpus(self):
qs = Corpus.objects.readable(self.request.user)
corpus = get_object_or_404(qs, pk=self.kwargs["pk"])
if self.request.method not in permissions.SAFE_METHODS and not corpus.is_writable(self.request.user):
raise PermissionDenied(detail="You do not have write access to this corpus.")
return corpus
def get_queryset(self):
return CorpusExport \
.objects \
.filter(corpus=self.get_corpus(self.kwargs["pk"])) \
.filter(corpus=self.corpus) \
.select_related("user") \
.order_by("-created")
def post(self, *args, **kwargs):
corpus = self.get_corpus(self.kwargs["pk"], role=Role.Contributor)
if corpus.exports.filter(state__in=(CorpusExportState.Created, CorpusExportState.Running)).exists():
raise ValidationError("An export is already running for this corpus.")
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f"An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.")
export = corpus.exports.create(user=self.request.user)
export.start()
return Response(CorpusExportSerializer(export).data, status=status.HTTP_201_CREATED)
def get_serializer_context(self):
context = super().get_serializer_context()
context["corpus"] = self.corpus
return context
@extend_schema(
......
......@@ -46,12 +46,12 @@ EXPORT_QUERIES = [
]
def run_pg_query(query):
def run_pg_query(query, source_db):
"""
Run a single Postgresql query and split the results into chunks.
When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name.
"""
with connections["default"].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
with connections[source_db].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
pg_cursor.itersize = BATCH_SIZE
pg_cursor.execute(query)
......@@ -122,7 +122,11 @@ def export_corpus(corpus_export: CorpusExport) -> None:
corpus_export.state = CorpusExportState.Running
corpus_export.save()
logger.info(f"Exporting corpus {corpus_export.corpus_id} into {db_path}")
export_source = f"{corpus_export.corpus_id}"
if corpus_export.source != "default":
export_source += f" from source {corpus_export.source}"
logger.info(f"Exporting corpus {export_source} into {db_path}")
db = sqlite3.connect(db_path)
cursor = db.cursor()
......@@ -135,7 +139,7 @@ def export_corpus(corpus_export: CorpusExport) -> None:
if rq_job:
rq_job.set_progress(i / (len(EXPORT_QUERIES) + 1))
for chunk in run_pg_query(query.format(corpus_id=corpus_export.corpus_id)):
for chunk in run_pg_query(query.format(corpus_id=corpus_export.corpus_id), corpus_export.source):
save_sqlite(chunk, name, cursor)
db.commit()
......
This diff is collapsed.
......@@ -14,9 +14,14 @@ from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType
from arkindex.users.models import User
# Constants used in architecture project
IMAGE_SERVER_ID = 12345
IMAGE_SERVER_BUCKET = "iiif"
IMAGE_SERVER_REGION = "local"
UPLOADS_IMAGE_SERVER_ID = 12345
UPLOADS_IMAGE_SERVER_URL = "https://uploads.iiif.ark.localhost/iiif/2"
UPLOADS_IMAGE_SERVER_BUCKET = "uploads"
UPLOADS_IMAGE_SERVER_REGION = "local"
INGEST_IMAGE_SERVER_ID = 67890
INGEST_IMAGE_SERVER_URL = "https://ingest.iiif.ark.localhost/iiif/2"
INGEST_IMAGE_SERVER_BUCKET = "ingest"
INGEST_IMAGE_SERVER_REGION = "local"
PONOS_FARM_ID = "001e411a-1111-2222-3333-444455556666"
PONOS_FARM_NAME = "Bootstrap farm"
PONOS_FARM_SEED = "b12868101dab84984481741663d809d2393784894d6e807ceee0bd95051bf971"
......@@ -52,6 +57,46 @@ class Command(BaseCommand):
user.save()
self.warn(f"Updated user {user} to admin")
def create_image_server(self, id, url, bucket, region, display_name):
try:
server = ImageServer.objects.get(Q(id=id) | Q(url=url))
if server.id != id:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=id)
ImageServer.objects.filter(id=server.id).update(id=id)
self.warn(f"Image server {server.id} updated to {id}")
# Update internal reference for updates below
server.id = id
if server.url != url:
server.url = url
server.save()
# Update base settings
if server.s3_bucket != bucket or server.s3_region != region:
server.s3_bucket = bucket
server.s3_region = region
server.save()
self.warn(f"Updated image server {server.id} S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=id,
url=url,
s3_bucket=bucket,
s3_region=region,
display_name=display_name,
)
self.success(f"Image server {server.id} created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
return server
def handle(self, **options):
# Never allow running this script in production
if not settings.DEBUG:
......@@ -108,47 +153,18 @@ class Command(BaseCommand):
self.success(f"Created token {ADMIN_API_TOKEN}")
# an image server for local cantaloupe https://ark.localhost/iiif/2
try:
server = ImageServer.objects.get(url="https://ark.localhost/iiif/2")
if server.id != IMAGE_SERVER_ID:
# Migrate existing images & server id in a single transaction
with transaction.atomic():
server.images.update(server_id=IMAGE_SERVER_ID)
ImageServer.objects.filter(id=server.id).update(id=IMAGE_SERVER_ID)
self.warn(f"Image server {server.id} updated to {IMAGE_SERVER_ID}")
# Update internal reference for updates below
server.id = IMAGE_SERVER_ID
# Update base settings
if server.s3_bucket != IMAGE_SERVER_BUCKET or server.s3_region != IMAGE_SERVER_REGION:
server.s3_bucket = IMAGE_SERVER_BUCKET
server.s3_region = IMAGE_SERVER_REGION
server.save()
self.warn("Updated image server S3 settings")
else:
self.success(f"Image server {server.id} valid")
except ImageServer.DoesNotExist:
try:
server = ImageServer.objects.create(
id=IMAGE_SERVER_ID,
url="https://ark.localhost/iiif/2",
s3_bucket=IMAGE_SERVER_BUCKET,
s3_region=IMAGE_SERVER_REGION,
display_name="Development local server",
)
self.success("Image server created")
except IntegrityError as e:
self.fail(f"Failed to create image server: {e}")
return
uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID, UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET, UPLOADS_IMAGE_SERVER_REGION, "Local IIIF server for user uploaded files through frontend")
if uploads_server is None:
return
self.create_image_server(INGEST_IMAGE_SERVER_ID, INGEST_IMAGE_SERVER_URL, INGEST_IMAGE_SERVER_BUCKET, INGEST_IMAGE_SERVER_REGION, "Local IIIF server for ingested files from minio")
# Check there is not already a local server with invalid path
# We'll merge its image into the new one
# This bad server may have been created by automatic IIIF server detection
try:
bad_server = ImageServer.objects.get(url="https://ark.localhost/iiif")
bad_server.merge_into(server)
self.warn(f"Merged images from {bad_server.id} into {server.id}")
bad_server = ImageServer.objects.get(url="https://uploads.iiif.ark.localhost/iiif")
bad_server.merge_into(uploads_server)
self.warn(f"Merged images from {bad_server.id} into {uploads_server.id}")
bad_server.delete()
self.warn("Deleted old server")
......@@ -194,17 +210,21 @@ class Command(BaseCommand):
)
self.success(f"Created revision {revision.hash}")
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
try:
version = WorkerVersion.objects.get(id=IMPORT_WORKER_VERSION_ID)
created = False
except WorkerVersion.DoesNotExist:
version, created = worker.versions.get_or_create(
revision=revision,
defaults={
"id": IMPORT_WORKER_VERSION_ID,
"configuration": {},
"state": WorkerVersionState.Created,
"gpu_usage": FeatureUsage.Disabled,
"docker_image": None,
"docker_image_iid": None,
}
)
if created:
self.success(f"Created worker version {version.slug}")
else:
......
#!/usr/bin/env python3
from datetime import datetime, timezone
from unittest.mock import patch
from django.contrib.gis.geos import LinearRing
......@@ -8,7 +7,7 @@ from django.utils import timezone as DjangoTimeZone
from arkindex.documents.models import Corpus, Element, MetaData, MetaType
from arkindex.images.models import Image, ImageServer
from arkindex.ponos.models import Farm, State
from arkindex.ponos.models import Farm
from arkindex.process.models import (
FeatureUsage,
Process,
......@@ -104,23 +103,7 @@ class Command(BaseCommand):
farm = Farm.objects.create(name="Wheat farm")
farm.memberships.create(user=user, level=Role.Guest.value)
# Create a fake docker build with a docker image task
build_process = Process.objects.create(
farm=farm,
creator=superuser,
mode=ProcessMode.Repository,
)
build_task = build_process.tasks.create(
run=0,
depth=0,
slug="docker_build",
state=State.Completed,
# Use an expiry very far away so that task is never expired
expiry=datetime(2100, 12, 31, 23, 59, 59, 999999, timezone.utc),
)
docker_image = build_task.artifacts.create(size=42_000, path="/path/to/docker_build")
# Create some workers for the repository with their available version
# Create some workers with available versions
recognizer_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
name="Recognizer",
......@@ -131,7 +114,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
dla_worker = WorkerVersion.objects.create(
worker=worker_repo.workers.create(
......@@ -143,7 +126,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -156,7 +139,7 @@ class Command(BaseCommand):
configuration={},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest"
)
WorkerVersion.objects.create(
......@@ -169,7 +152,7 @@ class Command(BaseCommand):
configuration={"test": 42},
state=WorkerVersionState.Available,
model_usage=FeatureUsage.Disabled,
docker_image=docker_image,
docker_image_iid="registry.somewhere.com/something:latest",
gpu_usage=FeatureUsage.Required
)
......@@ -185,7 +168,7 @@ class Command(BaseCommand):
state=WorkerVersionState.Available,
gpu_usage=FeatureUsage.Disabled,
model_usage=FeatureUsage.Required,
docker_image=docker_image
docker_image_iid="registry.somewhere.com/something:latest"
)
# Create a custom worker version that is not linked to a Git repository/revision
......
......@@ -57,7 +57,7 @@ class Migration(migrations.Migration):
),
),
],
# This can be removed by manage.py squashmigrations
# This can be removed by `arkindex squashmigrations`
elidable=True,
),
]
# Generated by Django 4.1.7 on 2024-02-28 15:56
from django.db import migrations, models
from arkindex.project import settings
class Migration(migrations.Migration):
dependencies = [
("documents", "0008_alter_elementtype_color_alter_entitytype_color"),
]
operations = [
migrations.AddField(
model_name="corpusexport",
name="source",
field=models.CharField(choices=[(source, source) for source in settings.EXPORT_SOURCES], default="default", max_length=50),
),
]
......@@ -73,6 +73,18 @@ class Corpus(IndexableModel):
for values in DEFAULT_CORPUS_TYPES
)
def is_writable(self, user) -> bool:
"""
Whether a user has write access to this corpus
"""
if user.is_anonymous or getattr(user, "is_agent", False):
return False
if user.is_admin:
return True
from arkindex.users.utils import get_max_level
level = get_max_level(user, self)
return level is not None and level >= Role.Contributor.value
class ElementType(models.Model):
id = models.UUIDField(default=uuid.uuid4, primary_key=True, editable=False)
......@@ -1185,6 +1197,7 @@ class CorpusExport(S3FileMixin, IndexableModel):
corpus = models.ForeignKey(Corpus, related_name="exports", on_delete=models.CASCADE)
user = models.ForeignKey(settings.AUTH_USER_MODEL, related_name="exports", on_delete=models.CASCADE)
state = EnumField(CorpusExportState, max_length=10, default=CorpusExportState.Created)
source = models.CharField(max_length=50, default="default", choices=[(source, source) for source in settings.EXPORT_SOURCES])
s3_bucket = settings.AWS_EXPORT_BUCKET
......
import math
import uuid
from collections import defaultdict
from functools import cached_property
from textwrap import dedent
from django.conf import settings
......@@ -23,7 +24,6 @@ from arkindex.documents.serializers.light import (
from arkindex.documents.serializers.ml import ClassificationSerializer, WorkerRunSummarySerializer
from arkindex.images.models import Image
from arkindex.images.serializers import ZoneSerializer
from arkindex.ponos.utils import get_process_from_task_auth
from arkindex.process.models import WorkerVersion
from arkindex.project.fields import Array
from arkindex.project.mixins import SelectionMixin
......@@ -429,29 +429,6 @@ class ElementTinySerializer(serializers.ModelSerializer):
)
class ElementSlimSerializer(ElementTinySerializer):
"""
Fully serialises a document
"""
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
@extend_schema_field(serializers.CharField(allow_null=True))
def get_thumbnail_put_url(self, element):
"""
Only set the Thumbnail PUT URL for Ponos tasks that
are running the thumbnails generation on a folder.
"""
if element.type.folder:
process = get_process_from_task_auth(self.context["request"])
if process and process.generate_thumbnails:
return element.thumbnail.s3_put_url
class Meta(ElementTinySerializer.Meta):
model = Element
fields = ElementTinySerializer.Meta.fields + ("thumbnail_put_url",)
read_only_fields = ElementTinySerializer.Meta.read_only_fields + ("thumbnail_put_url",)
@extend_schema_serializer(deprecate_fields=("worker_version_id", ))
class ElementListSerializer(ElementTinySerializer):
created = serializers.DateTimeField(read_only=True)
......@@ -555,7 +532,7 @@ class ElementParentSerializer(serializers.Serializer):
@extend_schema_serializer(deprecate_fields=("worker_version", ))
class ElementSerializer(ElementSlimSerializer):
class ElementSerializer(ElementTinySerializer):
"""
Serialize an element with its metadata and classifications
"""
......@@ -591,9 +568,11 @@ class ElementSerializer(ElementSlimSerializer):
worker_run = WorkerRunSummarySerializer(read_only=True, allow_null=True)
thumbnail_put_url = serializers.SerializerMethodField(read_only=True)
class Meta:
model = Element
fields = ElementSlimSerializer.Meta.fields + (
fields = ElementTinySerializer.Meta.fields + (
"created",
"creator",
"rights",
......@@ -603,32 +582,57 @@ class ElementSerializer(ElementSlimSerializer):
"polygon",
"worker_version",
"confidence",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
read_only_fields = ElementSlimSerializer.Meta.read_only_fields + (
read_only_fields = ElementTinySerializer.Meta.read_only_fields + (
"created",
"creator",
"rights",
"metadata_count",
"classifications",
"worker_version",
"worker_run"
"worker_run",
"thumbnail_put_url",
)
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
@cached_property
def element_rights(self):
if not self.instance:
return
user = self.context["request"].user
level = get_max_level(user, element.corpus)
level = get_max_level(user, self.instance.corpus)
# Admin access is granted to both corpus admins and element creators that are corpus contributors
if level >= Role.Contributor.value and self.instance.creator_id == user.id:
return Role.Admin.value
return level
@extend_schema_field(serializers.ListField(child=serializers.ChoiceField(["read", "write", "admin"])))
def get_rights(self, element):
rights = ["read"]
if level >= Role.Contributor.value:
if self.element_rights >= Role.Contributor.value:
rights.append("write")
# Admin access is granted to both corpus admins and element creators
if level >= Role.Admin.value or (level >= Role.Contributor.value and element.creator_id == user.id):
if self.element_rights >= Role.Admin.value:
rights.append("admin")
return rights
@extend_schema_field(serializers.CharField(
allow_null=True,
help_text=dedent("""
URL where a PUT request may be sent to upload a new thumbnail for this element.
Only available on folder elements.
Requires **admin** access to the corpus, or **contributor** access to the corpus and to be the element's creator.
"""),
))
def get_thumbnail_put_url(self, element):
if element.type.folder and self.element_rights >= Role.Admin.value:
return element.thumbnail.s3_put_url
def update(self, instance, validated_data):
image = validated_data.pop("image", None)
polygon = validated_data.pop("polygon", None)
......
from datetime import timedelta
from django.conf import settings
from django.utils import timezone
from rest_framework import serializers
from rest_framework.exceptions import ValidationError
from arkindex.documents.models import CorpusExport, CorpusExportState
from arkindex.project.serializer_fields import EnumField
......@@ -6,9 +11,38 @@ from arkindex.users.serializers import SimpleUserSerializer
class CorpusExportSerializer(serializers.ModelSerializer):
user = SimpleUserSerializer()
state = EnumField(CorpusExportState)
user = SimpleUserSerializer(read_only=True)
state = EnumField(CorpusExportState, read_only=True)
class Meta:
model = CorpusExport
fields = ("id", "created", "updated", "corpus_id", "user", "state")
fields = ("id", "created", "updated", "corpus_id", "user", "state", "source",)
def validate(self, data):
corpus = self.context["corpus"]
source = data.get("source", "default")
# Check that there is no export already running for this corpus
if corpus.exports.filter(state__in=(CorpusExportState.Created, CorpusExportState.Running)).exists():
raise ValidationError("An export is already running for this corpus.")
# Check that there is no available completed export from the same source created less than {EXPORT_TTL_SECONDS}
# ago for this corpus
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
source=source,
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f"An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.")
data["corpus"] = corpus
data["source"] = source
return data
def create(self, validated_data):
export = CorpusExport.objects.create(
user=self.context["request"].user,
corpus=validated_data["corpus"],
source=validated_data["source"]
)
export.start()
return export
......@@ -3,7 +3,7 @@ from django.db.models.signals import pre_delete
from arkindex.documents.models import Corpus, Element, EntityType, MetaType, Transcription
from arkindex.documents.tasks import corpus_delete
from arkindex.ponos.models import Farm, State, Task
from arkindex.process.models import CorpusWorkerVersion, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.process.models import CorpusWorkerVersion, Process, ProcessDataset, ProcessMode, Repository, WorkerVersion
from arkindex.project.tests import FixtureTestCase, force_constraints_immediate
from arkindex.training.models import Dataset
......@@ -132,7 +132,7 @@ class TestDeleteCorpus(FixtureTestCase):
message="oh",
author="me",
)
cls.process = cls.rev.processes.create(
cls.process = Process.objects.create(
creator=cls.user,
corpus=cls.corpus2,
mode=ProcessMode.Files,
......@@ -204,7 +204,6 @@ class TestDeleteCorpus(FixtureTestCase):
self.dataset_process3.refresh_from_db()
self.assertTrue(self.repo.revisions.filter(id=self.rev.id).exists())
self.assertEqual(self.process.revision, self.rev)
self.assertEqual(self.process.files.get(), self.df)
self.assertTrue(Element.objects.get_descending(self.vol.id).filter(id=self.page.id).exists())
self.assertTrue(self.corpus2.datasets.filter(id=self.dataset2.id).exists())
......