Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (184)
Showing with 1021 additions and 761 deletions
......@@ -11,7 +11,7 @@ include:
# For jobs that run backend scripts directly
.backend-setup:
image: registry.gitlab.com/teklia/arkindex/backend/base:bookworm
image: registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia
cache:
paths:
......@@ -19,8 +19,7 @@ include:
before_script:
# Custom line to install our own deps from Git using GitLab CI credentials
- "pip install -e git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.com/teklia/arkindex/transkribus#egg=transkribus-client"
- "pip install -e git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.com/teklia/arkindex/license#egg=teklia-license"
- "pip install -e git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.teklia.com/arkindex/license#egg=teklia-license"
- pip install -r tests-requirements.txt
- "echo 'database: {host: postgres, port: 5432}\npublic_hostname: http://ci.arkindex.localhost' > $CONFIG_PATH"
......@@ -159,7 +158,7 @@ backend-build:
script:
- ci/build.sh Dockerfile
backend-build-binary:
backend-build-binary-docker:
stage: build
image: docker:19.03.1
services:
......@@ -181,6 +180,21 @@ backend-build-binary:
script:
- ci/build.sh Dockerfile.binary "-binary"
# Make sure arkindex is always compatible with Nuitka
backend-build-binary:
stage: build
image: python:3.10
before_script:
- pip install nuitka
script:
- python -m nuitka --nofollow-imports --include-package=arkindex --nofollow-import-to=*.tests arkindex/manage.py
except:
- schedules
backend-static-deploy:
stage: deploy
image: python:3-slim
......@@ -245,7 +259,7 @@ sentry-release:
release-notes:
stage: deploy
image: registry.gitlab.com/teklia/devops:latest
image: registry.gitlab.teklia.com/infra/devops:latest
rules:
- if: '$CI_COMMIT_TAG && $CI_COMMIT_TAG !~ /^base-.*/'
......@@ -257,7 +271,7 @@ release-notes:
bump-python-deps:
stage: deploy
image: registry.gitlab.com/teklia/devops:latest
image: registry.gitlab.teklia.com/infra/devops:latest
only:
- schedules
......
......@@ -7,5 +7,4 @@ use_parentheses = True
line_length = 120
default_section=FIRSTPARTY
known_first_party = transkribus
known_third_party = SolrClient,bleach,boto3,botocore,cryptography,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
FROM registry.gitlab.com/teklia/arkindex/backend/base:bookworm as build
# syntax=docker/dockerfile:1
FROM registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia as build
RUN mkdir build
ADD . build
RUN cd build && python3 setup.py sdist
FROM registry.gitlab.com/teklia/arkindex/backend/base:bookworm
ARG TRANSKRIBUS_BRANCH=master
ARG TRANSKRIBUS_ID=11180199
FROM registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia
ARG LICENSE_BRANCH=master
ARG LICENSE_ID=45943500
ARG GITLAB_TOKEN="gaFM7LRa9zy9QMowcUhx"
ARG LICENSE_ID=37
# Install transkribus-client from private repo
RUN \
mkdir /tmp/transkribus && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.com/api/v4/projects/$TRANSKRIBUS_ID/repository/archive.tar.gz?sha=$TRANSKRIBUS_BRANCH -O /tmp/transkribus/archive.tar.gz && \
tar --strip-components=1 -xvf /tmp/transkribus/archive.tar.gz -C /tmp/transkribus && \
cd /tmp/transkribus && pip install --disable-pip-version-check --no-cache-dir --quiet . && \
rm -rf /tmp/transkribus
# Auth token expires on 01/07/2024
ARG GITLAB_TOKEN="glpat-3sBZPFgkZbqJxfSqjcAa"
# Install teklia-license from private repo
RUN \
mkdir /tmp/teklia-license && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.com/api/v4/projects/$LICENSE_ID/repository/archive.tar.gz?sha=$LICENSE_BRANCH -O /tmp/teklia-license.tar.gz && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.teklia.com/api/v4/projects/$LICENSE_ID/repository/archive.tar.gz?sha=$LICENSE_BRANCH -O /tmp/teklia-license.tar.gz && \
tar --strip-components=1 -xvf /tmp/teklia-license.tar.gz -C /tmp/teklia-license && \
cd /tmp/teklia-license && pip install --disable-pip-version-check --no-cache-dir --quiet . && \
rm -rf /tmp/teklia-license
......@@ -39,7 +32,10 @@ RUN chown -R ark:teklia /backend_static
# Copy Version file
COPY VERSION /etc/arkindex.version
# Run with Daphne
ENV PORT 80
EXPOSE 80
CMD ["manage.py", "gunicorn", "--host=0.0.0.0"]
HEALTHCHECK --start-period=1m --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
# Run with Gunicorn
ENV PORT 8000
EXPOSE $PORT
CMD manage.py gunicorn --host=0.0.0.0 --port $PORT
# syntax=docker/dockerfile:1
FROM python:3.10-slim-bookworm AS compilation
RUN apt-get update && apt-get install --no-install-recommends -y build-essential wget
RUN pip install nuitka
ARG TRANSKRIBUS_BRANCH=master
ARG TRANSKRIBUS_ID=11180199
ARG LICENSE_BRANCH=master
ARG LICENSE_ID=45943500
ARG GITLAB_TOKEN="gaFM7LRa9zy9QMowcUhx"
ARG LICENSE_ID=37
# Auth token expires on 01/07/2024
ARG GITLAB_TOKEN="glpat-3sBZPFgkZbqJxfSqjcAa"
# We build in /usr/share because Django will try to load some files relative to that path
# once executed in the binary (management commands, ...)
......@@ -19,23 +20,16 @@ ADD arkindex /usr/share/arkindex
ADD base/requirements.txt /tmp/requirements-base-arkindex.txt
ADD requirements.txt /tmp/requirements-arkindex.txt
# Install transkribus-client from private repo
RUN \
mkdir /tmp/transkribus && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.com/api/v4/projects/$TRANSKRIBUS_ID/repository/archive.tar.gz?sha=$TRANSKRIBUS_BRANCH -O /tmp/transkribus.tar.gz && \
tar --strip-components=1 -xvf /tmp/transkribus.tar.gz -C /tmp/transkribus && \
mv /tmp/transkribus/transkribus /usr/share
# Install teklia-license from private repo
RUN \
mkdir /tmp/teklia-license && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.com/api/v4/projects/$LICENSE_ID/repository/archive.tar.gz?sha=$LICENSE_BRANCH -O /tmp/teklia-license.tar.gz && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.teklia.com/api/v4/projects/$LICENSE_ID/repository/archive.tar.gz?sha=$LICENSE_BRANCH -O /tmp/teklia-license.tar.gz && \
tar --strip-components=1 -xvf /tmp/teklia-license.tar.gz -C /tmp/teklia-license && \
mv /tmp/teklia-license/teklia_license /usr/share && \
cp /tmp/teklia-license/requirements.txt /tmp/requirements-license-arkindex.txt
# Build full requirements, removing relative or remote references to arkindex projects
RUN cat /tmp/requirements-*arkindex.txt | sort | uniq | grep -v -E '^arkindex|^#|transkribus-client|teklia-license' > /requirements.txt
RUN cat /tmp/requirements-*arkindex.txt | sort | uniq | grep -v -E '^arkindex|^#|teklia-license' > /requirements.txt
# List all management commands
RUN find /usr/share/arkindex/*/management -name '*.py' -not -name '__init__.py' > /commands.txt
......@@ -53,7 +47,6 @@ ENV NUITKA_RESOURCE_MODE=linker
RUN python -m nuitka \
--nofollow-imports \
--include-package=arkindex \
--include-package=transkribus \
--include-package=teklia_license \
--show-progress \
--lto=yes \
......@@ -61,7 +54,7 @@ RUN python -m nuitka \
arkindex/manage.py
# Start over from a clean setup
FROM registry.gitlab.com/teklia/arkindex/backend/base:bookworm as build
FROM registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia as build
# Import files from compilation
RUN mkdir /usr/share/arkindex
......@@ -85,7 +78,10 @@ COPY arkindex/documents/export/*.sql /usr/share/arkindex/documents/export/
# Otherwise Django will not load the compiled module
RUN for cmd in $(cat /usr/share/arkindex/commands.txt); do mkdir -p $(dirname $cmd); touch $cmd; done
HEALTHCHECK --start-period=1m --start-interval=1s --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
# Run gunicorn server
ENV PORT=80
EXPOSE 80
CMD ["arkindex", "gunicorn", "--host=0.0.0.0"]
EXPOSE $PORT
CMD arkindex gunicorn --host=0.0.0.0 --port $PORT
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
IMAGE_TAG=registry.gitlab.com/teklia/arkindex/backend
IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend
.PHONY: all release
......@@ -32,7 +32,7 @@ test-fixtures:
test-fixtures-run:
arkindex/manage.py migrate
arkindex/manage.py build_fixtures
arkindex/manage.py dumpdata --indent 4 process documents images users auth ponos > arkindex/documents/fixtures/data.json
arkindex/manage.py dumpdata --indent 4 process documents images users auth ponos training > arkindex/documents/fixtures/data.json
test-fixtures-restore:
# This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup
......
Backend for Historical Manuscripts Indexing
===========================================
[![pipeline status](https://gitlab.com/teklia/arkindex/backend/badges/master/pipeline.svg)](https://gitlab.com/teklia/arkindex/backend/commits/master)
[![pipeline status](https://gitlab.teklia.com/arkindex/backend/badges/master/pipeline.svg)](https://gitlab.teklia.com/arkindex/backend/commits/master)
## Requirements
* Clone of the [architecture](https://gitlab.com/teklia/arkindex/architecture)
* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture)
* Git
* Make
* Python 3.6+
......@@ -15,13 +15,13 @@ Backend for Historical Manuscripts Indexing
## Dev Setup
```
git clone git@gitlab.com:arkindex/backend.git
git clone git@gitlab.teklia.com:arkindex/backend.git
cd backend
mkvirtualenv ark -a .
pip install -e .[test]
```
When the [architecture](https://gitlab.com/teklia/arkindex/architecture) is running locally to provide required services:
When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services:
```
arkindex/manage.py migrate
......@@ -47,7 +47,7 @@ The line that sets the PDF policy is `<policy domain="coder" rights="none" patte
Arkindex uses OAuth to let a user connect their GitLab account(s) and register Git repositories. In local development, you will need to register Arkindex as a GitLab OAuth application for it to work.
Go to GitLab's [Applications settings](https://gitlab.com/profile/applications) and create a new application with the `api` scope and add the following callback URIs:
Go to GitLab's [Applications settings](https://gitlab.teklia.com/profile/applications) and create a new application with the `api` scope and add the following callback URIs:
```
http://127.0.0.1:8000/api/v1/oauth/providers/gitlab/callback/
......@@ -181,3 +181,7 @@ We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org
* Export a corpus to an SQLite database: `export_corpus`
To run them, use `make worker` to start an RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides an RQ worker running in Docker from a binary build.
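For illustration only (this snippet is not part of the diff), a minimal sketch of enqueuing one of these tasks by hand through django-rq; the dotted task path and the `corpus_id` keyword argument are assumptions, not the project's actual signatures:

```
# Minimal sketch, assuming django-rq is configured with a 'default' queue
# backed by the same Redis instance used by `make worker`.
# The task path and the corpus_id keyword below are illustrative assumptions.
import django_rq

queue = django_rq.get_queue('default')
# A worker started with `make worker` will pick this job up from Redis.
job = queue.enqueue(
    'arkindex.documents.tasks.export_corpus',
    corpus_id='11111111-1111-1111-1111-111111111111',
)
print(job.id, job.get_status())
```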
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
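As an illustration (not part of this diff), a minimal sketch of how a dedicated metrics port could be served with the standard Prometheus client; the environment-variable handling shown here is an assumption, not the project's actual settings code:

```
# Minimal sketch, assuming the metrics endpoint reads its port from the
# PROMETHEUS_METRICS_PORT environment variable; the default is illustrative.
import os

from prometheus_client import start_http_server

port = int(os.environ.get('PROMETHEUS_METRICS_PORT', '0'))
if port:
    # Expose the default registry on a dedicated port, separate from the application API.
    start_http_server(port)
```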
1.5.0-beta2
1.5.3
......@@ -18,22 +18,24 @@ from arkindex.documents.models import (
MLClass,
Transcription,
)
from arkindex.users.admin import GroupMembershipInline, UserMembershipInline
class ElementTypeInline(admin.TabularInline):
model = ElementType
prepopulated_fields = {'slug': ('display_name', )}
fields = ('slug', 'display_name', 'folder', 'indexable')
readonly_fields = ('slug', 'display_name', 'folder')
def has_add_permission(self, request, obj=None):
return False
class CorpusExportInline(admin.TabularInline):
model = CorpusExport
def has_delete_permission(self, request, obj=None):
return False
class CorpusAdmin(admin.ModelAdmin):
list_display = ('id', 'name', 'public', 'top_level_type', 'created')
search_fields = ('name', )
inlines = (ElementTypeInline, UserMembershipInline, GroupMembershipInline, CorpusExportInline)
inlines = (ElementTypeInline, )
ordering = ('-created', )
def has_delete_permission(self, request, obj=None):
......
......@@ -8,7 +8,18 @@ from uuid import UUID
from django.conf import settings
from django.core.exceptions import ValidationError as DjangoValidationError
from django.db import connection, transaction
from django.db.models import CharField, Count, F, FloatField, Prefetch, Q, QuerySet, Value, prefetch_related_objects
from django.db.models import (
Count,
Exists,
F,
FloatField,
OuterRef,
Prefetch,
Q,
QuerySet,
Value,
prefetch_related_objects,
)
from django.db.models.functions import Cast
from django.shortcuts import get_object_or_404
from django.utils.functional import cached_property
......@@ -68,7 +79,7 @@ from arkindex.documents.serializers.light import CorpusAllowedMetaDataSerializer
from arkindex.documents.serializers.ml import ElementTranscriptionSerializer
from arkindex.images.models import Image
from arkindex.ponos.utils import is_admin_or_ponos_task
from arkindex.process.models import WorkerRun, WorkerVersion
from arkindex.process.models import WorkerConfiguration, WorkerRun, WorkerVersion
from arkindex.project.fields import Unnest
from arkindex.project.mixins import ACLMixin, CorpusACLMixin, SelectionMixin
from arkindex.project.openapi import UUID_OR_FALSE, AutoSchema
......@@ -82,6 +93,7 @@ from arkindex.project.triggers import (
selection_worker_results_delete,
worker_results_delete,
)
from arkindex.training.models import DatasetElement, ModelVersion
from arkindex.users.models import Role
from arkindex.users.utils import filter_rights
......@@ -1138,9 +1150,9 @@ class ElementChildren(ElementsListBase):
def get_filters(self) -> Q:
filters = super().get_filters()
filters &= Q(paths__path__overlap=[self.kwargs['pk']])
if not self.is_recursive:
if self.is_recursive:
filters &= Q(paths__path__overlap=[self.kwargs['pk']])
else:
filters &= Q(paths__path__last=self.kwargs['pk'])
return filters
......@@ -1165,14 +1177,14 @@ class ElementChildren(ElementsListBase):
# Let `self.get_filters` handle filtering optimisations
if self.clean_params.get('order', 'position').lower() == 'position':
# This condition is necessary because when ordering by position ('paths__ordering')
# we run into this bug https://gitlab.com/teklia/arkindex/backend/-/issues/769 and unless
# we run into this bug https://gitlab.teklia.com/arkindex/backend/-/issues/769 and unless
# the ordering is also used in the .distinct(), this leads to some results being
# "cut off" from the response. For example, if you have 5 distinct elements, but two
# of these elements have multiple paths with different orderings within the same parent,
# the DISTINCT will work as expected during the COUNT query, and find 5 elements;
# however when next the elements are selected, the path ordering is used in the
# SELECT query which triggers the "duplicating" bug
# (https://gitlab.com/teklia/arkindex/backend/-/issues/76). This SELECT would return 7 elements
# (https://gitlab.teklia.com/arkindex/backend/-/issues/76). This SELECT would return 7 elements
# (2 duplicates) but the previous result from the COUNT would mean that only 5 of these
# 7 elements would get returned, and some elements would therefore (randomly) be missing
# from the results. Adding the 'paths__ordering' to the .distinct() here forces the
......@@ -1188,8 +1200,12 @@ class ElementChildren(ElementsListBase):
patch=extend_schema(description='Rename an element'),
put=extend_schema(description="Edit an element's attributes. Requires a write access on the corpus."),
delete=extend_schema(
description='Delete an element. Requires either an admin access on the corpus, '
'or a write access and to be the creator of this element.',
description=dedent("""
Delete an element.
This element cannot be part of a dataset.
Requires either an admin access on the corpus, or a write access and to be the creator of this element.
""").strip(),
parameters=[
OpenApiParameter(
'delete_children',
......@@ -1217,18 +1233,25 @@ class ElementRetrieve(ACLMixin, RetrieveUpdateDestroyAPIView):
queryset = Element.objects.filter(corpus__in=corpora)
if self.request and self.request.method == 'DELETE':
# Only include corpus and creator for ACL check and ID for deletion
return queryset.select_related('corpus').only('id', 'creator_id', 'corpus')
return (
queryset
.select_related('corpus')
.annotate(has_dataset=Exists(DatasetElement.objects.filter(element_id=OuterRef('pk'))))
.only('id', 'creator_id', 'corpus')
)
return queryset \
return (
queryset
.select_related(
'corpus',
'type',
'image__server',
'creator',
'worker_run'
) \
.prefetch_related(Prefetch('classifications', queryset=classifications_queryset)) \
)
.prefetch_related(Prefetch('classifications', queryset=classifications_queryset))
.annotate(metadata_count=Count('metadatas'))
)
def check_object_permissions(self, request, obj):
super().check_object_permissions(request, obj)
......@@ -1241,6 +1264,9 @@ class ElementRetrieve(ACLMixin, RetrieveUpdateDestroyAPIView):
if not self.has_access(obj.corpus, role.value):
access_repr = 'admin' if role == Role.Admin else 'write'
raise PermissionDenied(detail=f'You do not have {access_repr} access to this element.')
# Prevent the direct deletion of an element that is part of a dataset
if request.method == 'DELETE' and getattr(obj, 'has_dataset', False):
raise PermissionDenied(detail='You cannot delete an element that is part of a dataset.')
def get_serializer_context(self):
context = super().get_serializer_context()
......@@ -1263,14 +1289,6 @@ class ElementRetrieve(ACLMixin, RetrieveUpdateDestroyAPIView):
get=extend_schema(
operation_id='ListElementNeighbors',
tags=['elements'],
parameters=[
OpenApiParameter(
'n',
type={'type': 'integer', 'minimum': 1, 'maximum': 10},
description='Number of neighbors to retrieve around the element',
required=False,
)
],
)
)
class ElementNeighbors(ACLMixin, ListAPIView):
......@@ -1280,20 +1298,15 @@ class ElementNeighbors(ACLMixin, ListAPIView):
Requires a **read** access to the element's corpus.
"""
serializer_class = ElementNeighborsSerializer
pagination_class = None
# For OpenAPI type discovery
queryset = Element.objects.none()
def get_queryset(self):
n = self.request.query_params.get('n', 1)
try:
n = int(n)
except (TypeError, ValueError):
raise ValidationError({'n': 'Should be an integer between 1 and 10'})
if not 1 <= n <= 10:
raise ValidationError({'n': 'Should be an integer between 1 and 10'})
element = get_object_or_404(
Element.objects.select_related('corpus'),
# Include the attributes required for ACL checks and the API response
Element.objects.select_related('corpus', 'type').only('id', 'name', 'type__slug', 'corpus__public'),
id=self.kwargs['pk']
)
......@@ -1301,7 +1314,7 @@ class ElementNeighbors(ACLMixin, ListAPIView):
if not self.has_access(element.corpus, Role.Guest.value):
raise PermissionDenied(detail='You do not have a read access to this element.')
return Element.objects.get_neighbors(element, n)
return Element.objects.get_neighbors(element)
@extend_schema(tags=['elements'], request=None)
......@@ -1611,8 +1624,8 @@ class TranscriptionsPagination(PageNumberPagination):
class ElementTranscriptions(ListAPIView):
"""
List all transcriptions for an element, optionally filtered by type or worker version id.
The recursive parameter allows listing transcriptions on sub-elements,
otherwise element fields in the response will be set to null.
The recursive parameter allows listing transcriptions on sub-elements.
Element fields in the response are only set when using the recursive parameter.
"""
serializer_class = ElementTranscriptionSerializer
pagination_class = TranscriptionsPagination
......@@ -1641,78 +1654,83 @@ class ElementTranscriptions(ListAPIView):
return context
def get_queryset(self):
# ORDER BY casting IDs as char to avoid the PostgreSQL optimizer's inefficient scan
# TODO: See if select_related is faster than a prefetch on this endpoint
queryset = Transcription.objects \
.prefetch_related('worker_version') \
.annotate(char_id=Cast('id', output_field=CharField())) \
.order_by('char_id')
queryset = Transcription.objects.select_related('worker_run')
if self.is_recursive:
# The transcription's `element` field is only included when recursive=true,
# so we add the prefetch here
queryset = queryset.prefetch_related('element__image__server', 'element__type')
queryset = queryset.filter(element__in=(
Element
.objects
# Transcriptions from the current element
.filter(id=self.element.id)
# We are about to use a UNION; we need to explicitly say we only want the ID column in the SELECT,
# because Django will otherwise pick all Element attributes, which is not supported by `__in`.
.values('id')
# Add the current element and the child elements together using a UNION
# Using Q(element_id=…) | Q(element__in=…) makes PostgreSQL use very slow nested loops and multi-processing.
.union(
# Transcriptions from all children of the current element.
# We are not using get_descending, because it includes an ORDER BY clause and a DISTINCT clause,
# which are both unnecessary here.
ElementPath.objects.filter(path__contains=[self.element.id]).values('element_id'),
# Use UNION ALL so that PostgreSQL does not unnecessarily sort and deduplicate UUIDs
all=True,
)
))
# List and filter children results. Current element transcriptions
# are conditionally added after filtering the queryset.
queryset = (
queryset
.filter(element__paths__path__overlap=[self.element.id])
# Also filter by corpus ID for better performance
.filter(element__corpus_id=self.element.corpus_id)
# Transcription's `element` field is only included when recursive=true
.select_related('element__type')
)
else:
queryset = queryset.filter(element_id=self.element.id)
return queryset
def filter_queryset(self, queryset):
errors = {}
filters = Q()
errors = defaultdict(list)
# Filter by worker run
if 'worker_run' in self.request.query_params:
worker_run_id = self.request.query_params['worker_run']
if worker_run_id.lower() in ('false', '0'):
# Restrict to transcriptions without worker runs
queryset = queryset.filter(worker_run_id=None)
filters &= Q(worker_run_id=None)
else:
try:
queryset = queryset.filter(worker_run_id=worker_run_id)
except DjangoValidationError as e:
errors['worker_run'] = e.messages
filters &= Q(worker_run_id=uuid.UUID(worker_run_id))
except (TypeError, ValueError):
errors['worker_run'].append(f'“{worker_run_id}” is not a valid UUID.')
# Filter by worker version
if 'worker_version' in self.request.query_params:
worker_version_id = self.request.query_params['worker_version']
if worker_version_id.lower() in ('false', '0'):
# Restrict to transcriptions without worker versions
queryset = queryset.filter(worker_version_id=None)
# Restrict to transcriptions without worker versions
filters &= Q(worker_version_id=None)
else:
try:
queryset = queryset.filter(worker_version_id=worker_version_id)
except DjangoValidationError as e:
errors['worker_version'] = e.messages
filters &= Q(worker_version_id=uuid.UUID(worker_version_id))
except (TypeError, ValueError):
errors['worker_version'].append(f'“{worker_version_id}” is not a valid UUID.')
# Filter by element_type
element_type = self.request.query_params.get('element_type')
if element_type:
queryset = queryset.select_related('element__type').filter(element__type__slug=element_type)
elt_type_filter = self.request.query_params.get('element_type')
if elt_type_filter:
queryset = queryset.filter(element__type__slug=elt_type_filter)
if errors:
raise ValidationError(errors)
return queryset
queryset = queryset.filter(filters)
# Perform a UNION after applying filters including parent transcriptions.
# This has better performance than a OR clause,
# especially when filtering by element type.
# https://gitlab.teklia.com/arkindex/backend/-/merge_requests/2180
if (
self.is_recursive
and (
elt_type_filter is None
or elt_type_filter == self.element.type.slug
)
):
queryset = queryset.union(
(
self.element.transcriptions
.select_related('element__type', 'worker_run')
.filter(filters)
),
# No element can be duplicated here
all=True,
)
return queryset.order_by('id')
@extend_schema_view(
......@@ -2194,6 +2212,21 @@ class CorpusSelectionDestroy(CorpusACLMixin, SelectionMixin, DestroyAPIView):
description='Only delete Worker Results on selected elements in this corpus. '
'Cannot be used together with `element_id`.',
),
OpenApiParameter(
'model_version_id',
type=UUID,
required=False,
description='Only delete Worker Results produced by a specific model version.',
),
OpenApiParameter(
'configuration_id',
type=UUID_OR_FALSE,
required=False,
description=dedent("""
Only delete Worker Results produced by a specific worker configuration.
If set to false, only delete results that use no specific configuration.
""")
),
],
tags=['ml'],
)
......@@ -2246,6 +2279,38 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
except WorkerVersion.DoesNotExist:
errors['worker_version_id'].append('This worker version does not exist.')
model_version = None
if 'model_version_id' in self.request.query_params:
try:
model_version_id = UUID(self.request.query_params['model_version_id'])
except (TypeError, ValueError):
errors['model_version_id'].append('Invalid UUID.')
else:
try:
model_version = ModelVersion.objects.select_related('model').get(id=model_version_id)
except ModelVersion.DoesNotExist:
errors['model_version_id'].append('This model version does not exist.')
configuration = None
if 'configuration_id' in self.request.query_params:
conf_id = self.request.query_params['configuration_id']
if conf_id.lower() in ('false', '0'):
configuration = False
else:
try:
conf_id = UUID(conf_id)
except (TypeError, ValueError):
errors['configuration_id'].append(
'Invalid UUID. You can set "false" to exclude results with a configuration.'
)
else:
try:
configuration = WorkerConfiguration.objects.get(id=conf_id)
except WorkerConfiguration.DoesNotExist:
errors['configuration_id'].append(
'This worker configuration does not exist.'
)
if errors:
raise ValidationError(errors)
......@@ -2253,6 +2318,8 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
selection_worker_results_delete(
corpus=corpus,
version=worker_version,
model_version=model_version,
configuration=configuration,
user_id=self.request.user.id,
)
else:
......@@ -2260,6 +2327,8 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
corpus_id=corpus.id,
version=worker_version,
element_id=element_id,
model_version=model_version,
configuration=configuration,
user_id=self.request.user.id,
)
......
......@@ -446,10 +446,10 @@ class TranscriptionEntities(ListAPIView):
raise serializers.ValidationError(errors)
transcription = get_object_or_404(
Transcription.objects.filter(
id=self.kwargs['pk'],
element__corpus__in=Corpus.objects.readable(self.request.user),
).only("id")
Transcription.objects
.using("default")
.filter(id=self.kwargs['pk'], element__corpus__in=Corpus.objects.readable(self.request.user))
.only("id")
)
return (
......
from datetime import datetime, timedelta, timezone
from datetime import timedelta
from textwrap import dedent
from django.conf import settings
from django.utils import timezone
from drf_spectacular.utils import extend_schema, extend_schema_view
from rest_framework import serializers, status
from rest_framework.exceptions import ValidationError
......@@ -12,9 +15,6 @@ from arkindex.project.mixins import CorpusACLMixin
from arkindex.project.permissions import IsVerified
from arkindex.users.models import Role
# Delay to generate a new export from a specific user
EXPORT_DELAY_HOURS = 6
@extend_schema(tags=['exports'])
@extend_schema_view(
......@@ -28,10 +28,15 @@ EXPORT_DELAY_HOURS = 6
post=extend_schema(
operation_id='StartExport',
request=None,
description=(
'Start a corpus export job.\n'
f'A user must wait {EXPORT_DELAY_HOURS} hours before being able to generate a new export of the same corpus.\n\n'
'Contributor access is required.'
description=dedent(
f"""
Start a corpus export job.
A user must wait for {settings.EXPORT_TTL_SECONDS} seconds after the last successful export
before being able to generate a new export of the same corpus.
Contributor access is required.
"""
),
)
)
......@@ -55,10 +60,10 @@ class CorpusExportAPIView(CorpusACLMixin, ListCreateAPIView):
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
created__gte=datetime.now(timezone.utc) - timedelta(hours=EXPORT_DELAY_HOURS)
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f'An export has already been made for this corpus in the last {EXPORT_DELAY_HOURS} hours.')
raise ValidationError(f'An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.')
export = corpus.exports.create(user=self.request.user)
export.start()
......
......@@ -54,6 +54,8 @@ class TranscriptionCreate(ACLMixin, CreateAPIView):
def get_object(self):
if not hasattr(self, 'element'):
self.element = super().get_object()
if not self.has_access(self.element.corpus, Role.Contributor.value):
raise PermissionDenied(detail="A write access to the element's corpus is required.")
return self.element
def get_queryset(self):
......@@ -65,8 +67,11 @@ class TranscriptionCreate(ACLMixin, CreateAPIView):
# We retrieve the readable objects then check permissions
# instead of retrieving writable objects directly so as not to
# get 404_NOT_FOUND errors on elements the user has access to.
return Element.objects.using('default').filter(
corpus__in=Corpus.objects.readable(self.request.user)
return (
Element.objects
.using('default')
.filter(corpus__in=Corpus.objects.readable(self.request.user))
.select_related('corpus')
)
def get_serializer_context(self):
......@@ -75,23 +80,10 @@ class TranscriptionCreate(ACLMixin, CreateAPIView):
context['element'] = self.get_object()
return context
def check_object_permissions(self, request, context):
super().check_object_permissions(request, context)
role = Role.Contributor
detail = "A write access to the element's corpus is required."
if not self.has_access(context.corpus, role.value):
raise PermissionDenied(detail=detail)
def perform_create(self, serializer):
return Transcription.objects.create(
element=self.element,
**serializer.validated_data
)
def create(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
obj = self.perform_create(serializer)
obj = serializer.save()
headers = self.get_success_headers(serializer.data)
return Response(
# Use a single transcription serializer for the response
......
......@@ -5,4 +5,5 @@ class DocumentsConfig(AppConfig):
name = 'arkindex.documents'
def ready(self):
from arkindex.documents import signals # noqa: F401
from arkindex.project import checks # noqa: F401
......@@ -41,6 +41,8 @@ EXPORT_QUERIES = [
'entity_role',
'entity_link',
'metadata',
'dataset',
'dataset_element',
]
......
SELECT
dataset.id,
dataset.name,
dataset.state,
ARRAY_TO_STRING(dataset.sets, ',', '')
FROM training_dataset dataset
WHERE dataset.corpus_id = '{corpus_id}'::uuid
SELECT
dataset_element.id,
dataset_element.element_id,
dataset_element.dataset_id,
dataset_element.set
FROM training_datasetelement dataset_element
INNER JOIN training_dataset dataset ON (dataset_element.dataset_id = dataset.id)
WHERE dataset.corpus_id = '{corpus_id}'::uuid
......@@ -37,3 +37,6 @@ CREATE INDEX metadata_element_id ON metadata (element_id);
CREATE INDEX metadata_entity_id ON metadata (entity_id);
CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id);
CREATE INDEX metadata_worker_run_id ON metadata (worker_run_id);
CREATE INDEX dataset_element_element_id ON dataset_element (element_id);
CREATE INDEX dataset_element_dataset_id ON dataset_element (dataset_id);
PRAGMA foreign_keys = ON;
CREATE TABLE export_version AS SELECT 7 AS version;
CREATE TABLE export_version AS SELECT 8 AS version;
CREATE TABLE image_server (
id INTEGER NOT NULL,
......@@ -30,9 +30,12 @@ CREATE TABLE worker_version (
name VARCHAR(100) NOT NULL,
slug VARCHAR(100) NOT NULL,
type VARCHAR(50) NOT NULL,
revision VARCHAR(50) NOT NULL,
repository_url TEXT NOT NULL,
PRIMARY KEY (id)
version INTEGER,
revision VARCHAR(50),
repository_url TEXT,
PRIMARY KEY (id),
CHECK ((version IS NULL) <> (revision IS NULL)),
CHECK ((revision IS NULL) == (repository_url IS NULL))
);
CREATE TABLE worker_run (
......@@ -204,3 +207,21 @@ CREATE TABLE metadata (
FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);
CREATE TABLE dataset (
id VARCHAR(37) NOT NULL,
name VARCHAR(100) NOT NULL,
state VARCHAR(50) NOT NULL DEFAULT 'open',
sets TEXT NOT NULL,
PRIMARY KEY (id)
);
CREATE TABLE dataset_element (
id VARCHAR(37) NOT NULL,
element_id VARCHAR(37) NOT NULL,
dataset_id VARCHAR(37) NOT NULL,
set_name VARCHAR(50) NOT NULL,
PRIMARY KEY (id),
FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE NO ACTION,
FOREIGN KEY (dataset_id) REFERENCES dataset (id) ON DELETE NO ACTION
);
......@@ -3,12 +3,12 @@
-- fills up the RAM. Adding DISTINCT to all the SELECT queries of the UNION
-- slows this query down by ~20%. Using multiple INs instead of a UNION makes
-- this query twice as slow.
SELECT version.id, worker.name, worker.slug, workertype.slug, revision.hash, repository.url
SELECT version.id, worker.name, worker.slug, workertype.slug, version.version, revision.hash, repository.url
FROM process_workerversion version
INNER JOIN process_worker worker ON (version.worker_id = worker.id)
INNER JOIN process_workertype workertype ON (worker.type_id = workertype.id)
INNER JOIN process_repository repository ON (worker.repository_id = repository.id)
INNER JOIN process_revision revision ON (version.revision_id = revision.id)
LEFT JOIN process_repository repository ON (worker.repository_id = repository.id)
LEFT JOIN process_revision revision ON (version.revision_id = revision.id)
WHERE version.id IN (
SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
UNION
......