Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • arkindex/backend
1 result
Show changes
Commits on Source (64)
Showing
with 174 additions and 96 deletions
# syntax=docker/dockerfile:1
FROM registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia as build
RUN mkdir build
......@@ -41,7 +42,10 @@ RUN chown -R ark:teklia /backend_static
# Copy Version file
COPY VERSION /etc/arkindex.version
# Run with Daphne
ENV PORT 80
EXPOSE 80
CMD ["manage.py", "gunicorn", "--host=0.0.0.0"]
HEALTHCHECK --start-period=1m --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
# Run with Gunicorn
ENV PORT 8000
EXPOSE $PORT
CMD manage.py gunicorn --host=0.0.0.0 --port $PORT
# syntax=docker/dockerfile:1
FROM python:3.10-slim-bookworm AS compilation
RUN apt-get update && apt-get install --no-install-recommends -y build-essential wget
......@@ -87,7 +88,10 @@ COPY arkindex/documents/export/*.sql /usr/share/arkindex/documents/export/
# Otherwise Django will not load the compiled module
RUN for cmd in $(cat /usr/share/arkindex/commands.txt); do mkdir -p $(dirname $cmd); touch $cmd; done
HEALTHCHECK --start-period=1m --start-interval=1s --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
# Run gunicorn server
ENV PORT=80
EXPOSE 80
CMD ["arkindex", "gunicorn", "--host=0.0.0.0"]
EXPOSE $PORT
CMD arkindex gunicorn --host=0.0.0.0 --port $PORT
......@@ -181,3 +181,7 @@ We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org
* Export a corpus to an SQLite database: `export_corpus`
To run them, use `make worker` to start an RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides an RQ worker running in Docker from a binary build.
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API.
1.5.0
1.5.2-alpha2
......@@ -18,22 +18,24 @@ from arkindex.documents.models import (
MLClass,
Transcription,
)
from arkindex.users.admin import GroupMembershipInline, UserMembershipInline
class ElementTypeInline(admin.TabularInline):
    """Read-only tabular inline showing a corpus' element types."""

    model = ElementType
    prepopulated_fields = {"slug": ("display_name", )}
    # Only `indexable` is left editable; the other fields are displayed read-only.
    fields = ("slug", "display_name", "folder", "indexable")
    readonly_fields = ("slug", "display_name", "folder")

    def has_add_permission(self, request, obj=None):
        # Forbid creating new element types from this inline.
        return False
class CorpusExportInline(admin.TabularInline):
    """Tabular inline listing a corpus' exports; deletion is disabled."""

    model = CorpusExport

    def has_delete_permission(self, request, obj=None):
        # Forbid deleting exports from this inline.
        return False
class CorpusAdmin(admin.ModelAdmin):
list_display = ('id', 'name', 'public', 'top_level_type', 'created')
search_fields = ('name', )
inlines = (ElementTypeInline, UserMembershipInline, GroupMembershipInline, CorpusExportInline)
inlines = (ElementTypeInline, )
ordering = ('-created', )
def has_delete_permission(self, request, obj=None):
......
......@@ -1273,13 +1273,15 @@ class ElementNeighbors(ACLMixin, ListAPIView):
Requires a **read** access to the element's corpus.
"""
serializer_class = ElementNeighborsSerializer
pagination_class = None
# For OpenAPI type discovery
queryset = Element.objects.none()
def get_queryset(self):
element = get_object_or_404(
Element.objects.select_related('corpus').only('id', 'corpus__public'),
# Include the attributes required for ACL checks and the API response
Element.objects.select_related('corpus', 'type').only('id', 'name', 'type__slug', 'corpus__public'),
id=self.kwargs['pk']
)
......@@ -2255,7 +2257,7 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
errors['model_version_id'].append('Invalid UUID.')
else:
try:
model_version = ModelVersion.objects.get(id=model_version_id)
model_version = ModelVersion.objects.select_related('model').get(id=model_version_id)
except ModelVersion.DoesNotExist:
errors['model_version_id'].append('This model version does not exist.')
......
from datetime import datetime, timedelta, timezone
from datetime import timedelta
from textwrap import dedent
from django.conf import settings
from django.utils import timezone
from drf_spectacular.utils import extend_schema, extend_schema_view
from rest_framework import serializers, status
from rest_framework.exceptions import ValidationError
......@@ -12,9 +15,6 @@ from arkindex.project.mixins import CorpusACLMixin
from arkindex.project.permissions import IsVerified
from arkindex.users.models import Role
# Delay to generate a new export from a specific user
EXPORT_DELAY_HOURS = 6
@extend_schema(tags=['exports'])
@extend_schema_view(
......@@ -28,10 +28,15 @@ EXPORT_DELAY_HOURS = 6
post=extend_schema(
operation_id='StartExport',
request=None,
description=(
'Start a corpus export job.\n'
f'A user must wait {EXPORT_DELAY_HOURS} hours before being able to generate a new export of the same corpus.\n\n'
'Contributor access is required.'
description=dedent(
f"""
Start a corpus export job.
A user must wait for {settings.EXPORT_TTL_SECONDS} seconds after the last successful import
before being able to generate a new export of the same corpus.
Contributor access is required.
"""
),
)
)
......@@ -55,10 +60,10 @@ class CorpusExportAPIView(CorpusACLMixin, ListCreateAPIView):
available_exports = corpus.exports.filter(
state=CorpusExportState.Done,
created__gte=datetime.now(timezone.utc) - timedelta(hours=EXPORT_DELAY_HOURS)
created__gte=timezone.now() - timedelta(seconds=settings.EXPORT_TTL_SECONDS)
)
if available_exports.exists():
raise ValidationError(f'An export has already been made for this corpus in the last {EXPORT_DELAY_HOURS} hours.')
raise ValidationError(f'An export has already been made for this corpus in the last {settings.EXPORT_TTL_SECONDS} seconds.')
export = corpus.exports.create(user=self.request.user)
export.start()
......
......@@ -41,6 +41,8 @@ EXPORT_QUERIES = [
'entity_role',
'entity_link',
'metadata',
'dataset',
'dataset_element',
]
......
-- Export all datasets of a corpus.
-- `sets` is a PostgreSQL array; it is flattened to a comma-separated string
-- so it can be stored in a single SQLite TEXT column.
-- `{corpus_id}` is interpolated by the calling Python code before execution.
SELECT
dataset.id,
dataset.name,
dataset.state,
ARRAY_TO_STRING(dataset.sets, ',', '')
FROM training_dataset dataset
WHERE dataset.corpus_id = '{corpus_id}'::uuid
-- Export all dataset elements of a corpus.
-- The join on training_dataset restricts rows to datasets of the given corpus,
-- since training_datasetelement has no corpus_id column of its own.
-- `{corpus_id}` is interpolated by the calling Python code before execution.
SELECT
dataset_element.id,
dataset_element.element_id,
dataset_element.dataset_id,
dataset_element.set
FROM training_datasetelement dataset_element
INNER JOIN training_dataset dataset ON (dataset_element.dataset_id = dataset.id)
WHERE dataset.corpus_id = '{corpus_id}'::uuid
......@@ -37,3 +37,6 @@ CREATE INDEX metadata_element_id ON metadata (element_id);
CREATE INDEX metadata_entity_id ON metadata (entity_id);
CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id);
CREATE INDEX metadata_worker_run_id ON metadata (worker_run_id);
CREATE INDEX dataset_element_element_id ON dataset_element (element_id);
CREATE INDEX dataset_element_dataset_id ON dataset_element (dataset_id);
PRAGMA foreign_keys = ON;
CREATE TABLE export_version AS SELECT 7 AS version;
CREATE TABLE export_version AS SELECT 8 AS version;
CREATE TABLE image_server (
id INTEGER NOT NULL,
......@@ -204,3 +204,21 @@ CREATE TABLE metadata (
FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);
-- Datasets exported from a corpus.
-- `sets` holds the dataset's set names as a single comma-separated string
-- (flattened from a PostgreSQL array on export).
CREATE TABLE dataset (
id VARCHAR(37) NOT NULL,
name VARCHAR(100) NOT NULL,
state VARCHAR(50) NOT NULL DEFAULT 'open',
sets TEXT NOT NULL,
PRIMARY KEY (id)
);
-- Links an element to a dataset, within a named set of that dataset.
-- ON DELETE NO ACTION: deleting a referenced element or dataset is rejected,
-- keeping dataset contents stable in the export.
CREATE TABLE dataset_element (
id VARCHAR(37) NOT NULL,
element_id VARCHAR(37) NOT NULL,
dataset_id VARCHAR(37) NOT NULL,
set_name VARCHAR(50) NOT NULL,
PRIMARY KEY (id),
FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE NO ACTION,
FOREIGN KEY (dataset_id) REFERENCES dataset (id) ON DELETE NO ACTION
);
......@@ -3927,7 +3927,6 @@
"slug": "docker_build",
"priority": 10,
"state": "completed",
"tags": "[]",
"image": "",
"shm_size": null,
"command": null,
......
......@@ -111,22 +111,24 @@ class Command(BaseCommand):
.filter(max_expiry__lt=timezone.now()) \
.exclude(id__in=worker_version_docker_image_processes)
tasks = Task.objects.filter(process__in=expired_processes)
artifacts = Artifact.objects.filter(task__process__in=expired_processes)
# A Ponos task can be linked to a Dataset when it provides the artifacts for a Dataset in a Complete state.
# As Datasets are meant to be stable, we do not want to destroy these tasks and artifacts.
tasks = Task.objects.filter(process__in=expired_processes, dataset__isnull=True)
artifacts = Artifact.objects.filter(task__in=tasks)
self.stdout.write(f'Removing {artifacts.count()} artifacts of expired processes from S3…')
for artifact in artifacts.select_related('task').iterator():
self.stdout.write(f'Removing artifact {artifact.s3.key}')
self.stdout.write(f'Removing artifact {artifact.s3_object.key}')
try:
artifact.s3.delete()
artifact.s3_delete()
except ClientError as e:
self.stdout.write(self.style.ERROR(str(e)))
self.stdout.write(f'Removing logs for {tasks.count()} tasks of expired processes from S3…')
for task in tasks.iterator():
self.stdout.write(f'Removing task log {task.s3_logs.key}')
self.stdout.write(f'Removing task log {task.logs.s3_object.key}')
try:
task.s3_logs.delete()
task.logs.s3_delete()
except ClientError as e:
self.stdout.write(self.style.ERROR(str(e)))
......
......@@ -2,6 +2,7 @@ import multiprocessing
import os
import sys
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.wsgi import get_wsgi_application
......@@ -19,7 +20,7 @@ class Command(BaseCommand):
parser.add_argument(
"--port",
type=int,
help="Port to bind gunicorn",
help="Port to bind the Arkindex application",
default=int(os.environ.get("PORT", 8000)),
)
parser.add_argument(
......@@ -35,13 +36,15 @@ class Command(BaseCommand):
except ImportError:
raise CommandError("Gunicorn is not available")
assert port != settings.PROMETHEUS_METRICS_PORT, "Application and metrics should use different ports"
# Calc max workers
workers = (multiprocessing.cpu_count() * 2) + 1
if max_workers > 0:
workers = min(workers, max_workers)
# Build bind string
bind = f"{host}:{port}"
bind = [f"{host}:{port}", f"{host}:{settings.PROMETHEUS_METRICS_PORT}"]
self.stdout.write(f"Running server on {bind} with {workers} workers")
# Do not send out CLI args to gunicorn as they are not compatible
......
......@@ -38,10 +38,10 @@ from arkindex.process.models import (
WorkerType,
WorkerVersion,
)
from arkindex.training.models import Model
from arkindex.training.models import Dataset, DatasetElement, Model
from arkindex.users.models import Role, User
EXPORT_VERSION = 7
EXPORT_VERSION = 8
TABLE_NAMES = {
'export_version',
......@@ -59,6 +59,8 @@ TABLE_NAMES = {
'transcription_entity',
'metadata',
'classification',
'dataset',
'dataset_element',
}
SQL_TABLES_QUERY = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%'"
......@@ -137,6 +139,9 @@ SQL_METADATA_QUERY = "SELECT * FROM metadata"
SQL_CLASSIFICATION_QUERY = "SELECT * FROM classification"
SQL_DATASET_QUERY = "SELECT * FROM dataset"
SQL_ELEMENT_DATASET_QUERY = "SELECT * FROM dataset_element"
class Command(BaseCommand):
help = "Import an SQLite database generated by an Arkindex export"
......@@ -307,6 +312,24 @@ class Command(BaseCommand):
worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
)]
def convert_datasets(self, row):
    """Build a Dataset instance from one exported `dataset` SQLite row.

    The `sets` column stores a comma-separated string; it is split back
    into a list of stripped set names. Returns a single-element list, as
    expected by the bulk-creation helper.
    """
    set_names = [name.strip() for name in row['sets'].split(',')]
    dataset = Dataset(
        id=row['id'],
        corpus=self.corpus,
        name=row['name'],
        sets=set_names,
        creator=self.user,
        description='Imported dataset',
    )
    return [dataset]
def convert_dataset_elements(self, row):
    """Build a DatasetElement from one exported `dataset_element` SQLite row.

    Returns a single-element list, as expected by the bulk-creation helper.
    Note the column is named `set_name` in SQLite but `set` on the model.
    """
    element = DatasetElement(
        id=row['id'],
        element_id=row['element_id'],
        dataset_id=row['dataset_id'],
        set=row['set_name'],
    )
    return [element]
def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True):
# Model name for logs
verbose_name_plural = ModelClass._meta.verbose_name_plural.lower()
......@@ -438,7 +461,7 @@ class Command(BaseCommand):
level=Role.Admin.value,
)
model_version, _ = model.objects.get_or_create(id=row['model_version_id'])
model_version, _ = model.versions.get_or_create(id=row['model_version_id'])
if row['configuration_id']:
configuration, _ = WorkerConfiguration.objects.get_or_create(
......@@ -576,6 +599,12 @@ class Command(BaseCommand):
# Create classifications
self.bulk_create_objects(Classification, self.convert_classifications, SQL_CLASSIFICATION_QUERY)
# Create datasets
self.bulk_create_objects(Dataset, self.convert_datasets, SQL_DATASET_QUERY)
# Create dataset elements
self.bulk_create_objects(DatasetElement, self.convert_dataset_elements, SQL_ELEMENT_DATASET_QUERY)
self.stdout.write(self.style.SUCCESS(f"Created corpus {corpus_name} in {t.delta}"))
self.db.close()
......@@ -4,6 +4,7 @@ import django
from django.db import DJANGO_VERSION_PICKLE_KEY, connections, models
from arkindex.project.fields import Unnest
from arkindex.users.managers import BaseACLManager
from arkindex.users.models import Role
......@@ -193,59 +194,34 @@ class ElementManager(models.Manager):
def get_neighbors(self, element):
"""
Returns a list of neighboring ElementPaths for an element, with a prefetched `element` attribute
and a list of prefetched parent elements in the `parents` attribute.
Returns a list of neighboring ElementPaths for an element, with overridden attributes:
- `ElementPath.path` is an array of all elements in the path, instead of element IDs.
- `ElementPath.previous` is the element that precedes this one in the same parent, or None if there is none.
- `ElementPath.previous` is the element that succeeds this one in the same parent, or None if there is none.
"""
paths = list(self.get_neighbor_paths(element))
# Build a set of all IDs to load related elements (neighbors, parents) then load them into a dict
related_elt_ids = set(chain(
(element.id,),
related_elt_ids = set(filter(None, chain(
*((path.previous, path.next) for path in paths),
*(path.path for path in paths),
))
elements = {
elt.id: elt
for elt in (
self.filter(id__in=filter(None, related_elt_ids))
.select_related('type')
.only('id', 'type__slug', 'name')
)
}
)))
elements = self.select_related('type').only('id', 'type__slug', 'name').in_bulk(related_elt_ids)
# Generate an output corresponding to endpoint expectations (compatibility purpose)
output = []
for path in paths:
if path.previous:
output.append({
'ordering': path.previous_ord,
'element': elements.get(path.previous),
'parents': list(map(elements.get, path.path)),
})
output.append({
'ordering': path.ordering,
'element': elements.get(element.id),
'parents': list(map(elements.get, path.path)),
})
if path.next:
output.append({
'ordering': path.next_ord,
'element': elements.get(path.next),
'parents': list(map(elements.get, path.path)),
})
return output
class CorpusManager(models.Manager):
path.element = element
path.previous = elements.get(path.previous)
path.next = elements.get(path.next)
path.path = list(map(elements.get, path.path))
return paths
class CorpusManager(BaseACLManager):
'''
Add ACL functions to corpus listing
'''
def filter_rights(self, *args, **kwargs):
# Avoid circular dependencies as this module is imported by documents.models
from arkindex.users.utils import filter_rights
return filter_rights(*args, **kwargs)
def readable(self, user):
return super().get_queryset().filter(
id__in=(self.filter_rights(user, self.model, Role.Guest.value).values('id'))
......
......@@ -13,6 +13,7 @@ from django.core.validators import MaxValueValidator, MinValueValidator, RegexVa
from django.db import connections, models, transaction
from django.db.models import Deferrable, Q
from django.db.models.functions import Cast, Least
from django.urls import reverse
from django.utils.functional import cached_property
from enumfields import Enum, EnumField
......@@ -59,6 +60,12 @@ class Corpus(IndexableModel):
def __str__(self):
return self.name
def get_absolute_url(self):
return urljoin(
settings.PUBLIC_HOSTNAME,
reverse('frontend-corpus-details', kwargs={'pk': self.id}),
)
def create_default_types(self):
self.types.bulk_create(
ElementType(corpus=self, **values)
......
......@@ -23,7 +23,7 @@ from arkindex.documents.serializers.light import (
from arkindex.documents.serializers.ml import ClassificationSerializer, WorkerRunSummarySerializer
from arkindex.images.models import Image
from arkindex.images.serializers import ZoneSerializer
from arkindex.ponos.models import Task
from arkindex.ponos.utils import get_process_from_task_auth
from arkindex.process.models import WorkerVersion
from arkindex.project.fields import Array
from arkindex.project.mixins import SelectionMixin
......@@ -441,16 +441,10 @@ class ElementSlimSerializer(ElementTinySerializer):
Only set the Thumbnail PUT URL for Ponos tasks that
are running the thumbnails generation on a folder.
"""
# TODO: This check would be simplified to process.thumbnails once that attribute
# is available, allowing to use the get_process_from_ponos_auth helper directly.
task = self.context.get('request') and self.context['request'].auth
if (
isinstance(task, Task)
and element.type.folder
and task.image == settings.ARKINDEX_TASKS_IMAGE
and "generate_thumbnails" in task.command
):
return element.thumbnail.s3_put_url
if element.type.folder:
process = get_process_from_task_auth(self.context['request'])
if process and process.generate_thumbnails:
return element.thumbnail.s3_put_url
class Meta(ElementTinySerializer.Meta):
model = Element
......@@ -673,14 +667,23 @@ class ElementSerializer(ElementSlimSerializer):
return instance
class ElementNeighborsSerializer(serializers.Serializer):
# position attribute is left for compatibility but represents the real path ordering
position = serializers.IntegerField(source='ordering')
element = ElementLightSerializer()
parents = serializers.ListField(
class ElementNeighborsSerializer(serializers.ModelSerializer):
previous = ElementLightSerializer(allow_null=True)
next = ElementLightSerializer(allow_null=True)
path = serializers.ListField(
child=ElementLightSerializer()
)
class Meta:
model = ElementPath
fields = (
'path',
'ordering',
'previous',
'next',
)
read_only_fields = fields
@extend_schema_serializer(deprecate_fields=('worker_version', ))
class ElementCreateSerializer(ElementLightSerializer):
......
......@@ -27,12 +27,12 @@ class SolrDocumentSerializer(serializers.Serializer):
transcription_id = serializers.UUIDField(allow_null=True)
transcription_text = serializers.CharField(allow_null=True)
transcription_confidence = serializers.FloatField(min_value=0, max_value=1, required=False)
transcription_confidence = serializers.FloatField(min_value=0, max_value=1, allow_null=True)
transcription_worker = serializers.CharField(allow_null=True)
classification_id = serializers.UUIDField(allow_null=True)
classification_name = serializers.CharField(allow_null=True)
classification_confidence = serializers.FloatField(min_value=0, max_value=1, required=False)
classification_confidence = serializers.FloatField(min_value=0, max_value=1, allow_null=True)
classification_worker = serializers.CharField(allow_null=True)
metadata_id = serializers.UUIDField(allow_null=True)
......