Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend
Commits on Source (29)
Showing changes with 148 additions and 68 deletions
......@@ -19,7 +19,6 @@ include:
before_script:
# Custom line to install our own deps from Git using GitLab CI credentials
- "pip install -e git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.teklia.com/arkindex/transkribus#egg=transkribus-client"
- "pip install -e git+https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.teklia.com/arkindex/license#egg=teklia-license"
- pip install -r tests-requirements.txt
- "echo 'database: {host: postgres, port: 5432}\npublic_hostname: http://ci.arkindex.localhost' > $CONFIG_PATH"
......
......@@ -7,5 +7,4 @@ use_parentheses = True
line_length = 120
default_section=FIRSTPARTY
known_first_party = transkribus
known_third_party = SolrClient,bleach,boto3,botocore,cryptography,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
......@@ -6,22 +6,12 @@ ADD . build
RUN cd build && python3 setup.py sdist
FROM registry.gitlab.teklia.com/arkindex/backend/base:gitlab-teklia
ARG TRANSKRIBUS_BRANCH=master
ARG TRANSKRIBUS_ID=63
ARG LICENSE_BRANCH=master
ARG LICENSE_ID=37
# Auth token expires on 01/07/2024
ARG GITLAB_TOKEN="glpat-3sBZPFgkZbqJxfSqjcAa"
# Install transkribus-client from private repo
RUN \
mkdir /tmp/transkribus && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.teklia.com/api/v4/projects/$TRANSKRIBUS_ID/repository/archive.tar.gz?sha=$TRANSKRIBUS_BRANCH -O /tmp/transkribus/archive.tar.gz && \
tar --strip-components=1 -xvf /tmp/transkribus/archive.tar.gz -C /tmp/transkribus && \
cd /tmp/transkribus && pip install --disable-pip-version-check --no-cache-dir --quiet . && \
rm -rf /tmp/transkribus
# Install teklia-license from private repo
RUN \
mkdir /tmp/teklia-license && \
......@@ -42,10 +32,10 @@ RUN chown -R ark:teklia /backend_static
# Copy Version file
COPY VERSION /etc/arkindex.version
HEALTHCHECK --start-period=1m --start-interval=1s --interval=1m --timeout=5s \
HEALTHCHECK --start-period=1m --interval=1m --timeout=5s \
CMD wget --spider --quiet http://localhost/api/v1/public-key/ || exit 1
# Run with Gunicorn
ENV PORT 80
EXPOSE 80
CMD ["manage.py", "gunicorn", "--host=0.0.0.0"]
ENV PORT 8000
EXPOSE $PORT
CMD manage.py gunicorn --host=0.0.0.0 --port $PORT
......@@ -5,8 +5,6 @@ RUN apt-get update && apt-get install --no-install-recommends -y build-essential
RUN pip install nuitka
ARG TRANSKRIBUS_BRANCH=master
ARG TRANSKRIBUS_ID=63
ARG LICENSE_BRANCH=master
ARG LICENSE_ID=37
......@@ -22,13 +20,6 @@ ADD arkindex /usr/share/arkindex
ADD base/requirements.txt /tmp/requirements-base-arkindex.txt
ADD requirements.txt /tmp/requirements-arkindex.txt
# Install transkribus-client from private repo
RUN \
mkdir /tmp/transkribus && \
wget --header "PRIVATE-TOKEN: $GITLAB_TOKEN" https://gitlab.teklia.com/api/v4/projects/$TRANSKRIBUS_ID/repository/archive.tar.gz?sha=$TRANSKRIBUS_BRANCH -O /tmp/transkribus.tar.gz && \
tar --strip-components=1 -xvf /tmp/transkribus.tar.gz -C /tmp/transkribus && \
mv /tmp/transkribus/transkribus /usr/share
# Install teklia-license from private repo
RUN \
mkdir /tmp/teklia-license && \
......@@ -38,7 +29,7 @@ RUN \
cp /tmp/teklia-license/requirements.txt /tmp/requirements-license-arkindex.txt
# Build full requirements, removing relative or remote references to arkindex projects
RUN cat /tmp/requirements-*arkindex.txt | sort | uniq | grep -v -E '^arkindex|^#|transkribus-client|teklia-license' > /requirements.txt
RUN cat /tmp/requirements-*arkindex.txt | sort | uniq | grep -v -E '^arkindex|^#|teklia-license' > /requirements.txt
# List all management commands
RUN find /usr/share/arkindex/*/management -name '*.py' -not -name '__init__.py' > /commands.txt
......@@ -56,7 +47,6 @@ ENV NUITKA_RESOURCE_MODE=linker
RUN python -m nuitka \
--nofollow-imports \
--include-package=arkindex \
--include-package=transkribus \
--include-package=teklia_license \
--show-progress \
--lto=yes \
......@@ -93,5 +83,5 @@ HEALTHCHECK --start-period=1m --start-interval=1s --interval=1m --timeout=5s \
# Run gunicorn server
ENV PORT=80
EXPOSE 80
CMD ["arkindex", "gunicorn", "--host=0.0.0.0"]
EXPOSE $PORT
CMD arkindex gunicorn --host=0.0.0.0 --port $PORT
......@@ -181,3 +181,7 @@ We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org
* Export a corpus to an SQLite database: `export_corpus`
To run them, use `make worker` to start an RQ worker. You will need Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides an RQ worker running in Docker from a binary build.
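
For illustration, a minimal sketch of enqueueing such a job with django-rq; the dotted task path and argument below are hypothetical, since Arkindex normally triggers these jobs internally:

```python
# Minimal sketch, not taken from this repository: push a job onto the default
# RQ queue so that a worker started with `make worker` can pick it up.
import django_rq

queue = django_rq.get_queue("default")
# RQ accepts a dotted path to the callable; the path and argument here are
# placeholders, not the actual Arkindex task signature.
queue.enqueue("arkindex.documents.tasks.export_corpus", corpus_id="<corpus uuid>")
```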
## Metrics
The application serves metrics for Prometheus under the `/metrics` prefix.
A dedicated port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, exposing the metrics API separately from the application.
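
For example, a minimal sketch of checking the metrics endpoint; the host and port are assumptions (here `PROMETHEUS_METRICS_PORT=3000` on a local instance), adjust them to your deployment:

```python
# Minimal sketch, assuming PROMETHEUS_METRICS_PORT=3000 and a local instance.
import requests

response = requests.get("http://localhost:3000/metrics", timeout=5)
response.raise_for_status()

# Prometheus exposition format: one "metric_name{labels} value" line per sample.
for line in response.text.splitlines()[:10]:
    print(line)
```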
1.5.1
1.5.2-beta1
......@@ -8,7 +8,19 @@ from uuid import UUID
from django.conf import settings
from django.core.exceptions import ValidationError as DjangoValidationError
from django.db import connection, transaction
from django.db.models import CharField, Count, F, FloatField, Prefetch, Q, QuerySet, Value, prefetch_related_objects
from django.db.models import (
CharField,
Count,
Exists,
F,
FloatField,
OuterRef,
Prefetch,
Q,
QuerySet,
Value,
prefetch_related_objects,
)
from django.db.models.functions import Cast
from django.shortcuts import get_object_or_404
from django.utils.functional import cached_property
......@@ -82,7 +94,7 @@ from arkindex.project.triggers import (
selection_worker_results_delete,
worker_results_delete,
)
from arkindex.training.models import ModelVersion
from arkindex.training.models import DatasetElement, ModelVersion
from arkindex.users.models import Role
from arkindex.users.utils import filter_rights
......@@ -1189,8 +1201,12 @@ class ElementChildren(ElementsListBase):
patch=extend_schema(description='Rename an element'),
put=extend_schema(description="Edit an element's attributes. Requires a write access on the corpus."),
delete=extend_schema(
description='Delete an element. Requires either an admin access on the corpus, '
'or a write access and to be the creator of this element.',
description=dedent("""
Delete an element.
This element cannot be part of a dataset.
Requires either an admin access on the corpus, or a write access and to be the creator of this element.
""").strip(),
parameters=[
OpenApiParameter(
'delete_children',
......@@ -1218,18 +1234,25 @@ class ElementRetrieve(ACLMixin, RetrieveUpdateDestroyAPIView):
queryset = Element.objects.filter(corpus__in=corpora)
if self.request and self.request.method == 'DELETE':
# Only include corpus and creator for ACL check and ID for deletion
return queryset.select_related('corpus').only('id', 'creator_id', 'corpus')
return (
queryset
.select_related('corpus')
.annotate(has_dataset=Exists(DatasetElement.objects.filter(element_id=OuterRef('pk'))))
.only('id', 'creator_id', 'corpus')
)
return queryset \
return (
queryset
.select_related(
'corpus',
'type',
'image__server',
'creator',
'worker_run'
) \
.prefetch_related(Prefetch('classifications', queryset=classifications_queryset)) \
)
.prefetch_related(Prefetch('classifications', queryset=classifications_queryset))
.annotate(metadata_count=Count('metadatas'))
)
def check_object_permissions(self, request, obj):
super().check_object_permissions(request, obj)
......@@ -1242,6 +1265,9 @@ class ElementRetrieve(ACLMixin, RetrieveUpdateDestroyAPIView):
if not self.has_access(obj.corpus, role.value):
access_repr = 'admin' if role == Role.Admin else 'write'
raise PermissionDenied(detail=f'You do not have {access_repr} access to this element.')
# Prevent the direct deletion of an element that is part of a dataset
if request.method == 'DELETE' and getattr(obj, 'has_dataset', False):
raise PermissionDenied(detail='You cannot delete an element that is part of a dataset.')
def get_serializer_context(self):
context = super().get_serializer_context()
......
......@@ -5,4 +5,5 @@ class DocumentsConfig(AppConfig):
name = 'arkindex.documents'
def ready(self):
from arkindex.documents import signals # noqa: F401
from arkindex.project import checks # noqa: F401
......@@ -41,6 +41,8 @@ EXPORT_QUERIES = [
'entity_role',
'entity_link',
'metadata',
'dataset',
'dataset_element',
]
......
SELECT
dataset.id,
dataset.name,
dataset.state,
ARRAY_TO_STRING(dataset.sets, ',', '')
FROM training_dataset dataset
WHERE dataset.corpus_id = '{corpus_id}'::uuid
SELECT
dataset_element.id,
dataset_element.element_id,
dataset_element.dataset_id,
dataset_element.set
FROM training_datasetelement dataset_element
INNER JOIN training_dataset dataset ON (dataset_element.dataset_id = dataset.id)
WHERE dataset.corpus_id = '{corpus_id}'::uuid
......@@ -37,3 +37,6 @@ CREATE INDEX metadata_element_id ON metadata (element_id);
CREATE INDEX metadata_entity_id ON metadata (entity_id);
CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id);
CREATE INDEX metadata_worker_run_id ON metadata (worker_run_id);
CREATE INDEX dataset_element_element_id ON dataset_element (element_id);
CREATE INDEX dataset_element_dataset_id ON dataset_element (dataset_id);
PRAGMA foreign_keys = ON;
CREATE TABLE export_version AS SELECT 7 AS version;
CREATE TABLE export_version AS SELECT 8 AS version;
CREATE TABLE image_server (
id INTEGER NOT NULL,
......@@ -204,3 +204,21 @@ CREATE TABLE metadata (
FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);
CREATE TABLE dataset (
id VARCHAR(37) NOT NULL,
name VARCHAR(100) NOT NULL,
state VARCHAR(50) NOT NULL DEFAULT 'open',
sets TEXT NOT NULL,
PRIMARY KEY (id)
);
CREATE TABLE dataset_element (
id VARCHAR(37) NOT NULL,
element_id VARCHAR(37) NOT NULL,
dataset_id VARCHAR(37) NOT NULL,
set_name VARCHAR(50) NOT NULL,
PRIMARY KEY (id),
FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE NO ACTION,
FOREIGN KEY (dataset_id) REFERENCES dataset (id) ON DELETE NO ACTION
);
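
For context, a minimal sketch (not part of this change) of reading the new tables back from an export, assuming a local SQLite file; the column names follow the schema above:

```python
# Minimal sketch: list dataset elements per set from an Arkindex SQLite export.
# The file name is an assumption; use the path of an actual export.
import sqlite3

db = sqlite3.connect("corpus_export.sqlite")
db.row_factory = sqlite3.Row

query = """
    SELECT dataset.name, dataset.state, dataset_element.set_name, dataset_element.element_id
    FROM dataset_element
    INNER JOIN dataset ON dataset_element.dataset_id = dataset.id
    ORDER BY dataset.name, dataset_element.set_name
"""
for row in db.execute(query):
    print(row["name"], row["state"], row["set_name"], row["element_id"])

db.close()
```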
......@@ -19,7 +19,6 @@
"element_type": null,
"name_contains": null,
"load_children": false,
"collection_id": null,
"use_cache": false,
"use_gpu": false,
"template": null,
......@@ -52,7 +51,6 @@
"element_type": null,
"name_contains": null,
"load_children": false,
"collection_id": null,
"use_cache": false,
"use_gpu": false,
"template": null,
......@@ -85,7 +83,6 @@
"element_type": null,
"name_contains": null,
"load_children": false,
"collection_id": null,
"use_cache": false,
"use_gpu": false,
"template": null,
......@@ -118,7 +115,6 @@
"element_type": null,
"name_contains": null,
"load_children": false,
"collection_id": null,
"use_cache": false,
"use_gpu": false,
"template": null,
......@@ -1771,7 +1767,6 @@
"last_login": null,
"email": "root@root.fr",
"display_name": "Admin",
"transkribus_email": null,
"is_active": true,
"is_admin": true,
"verified_email": true,
......@@ -1787,7 +1782,6 @@
"last_login": null,
"email": "user@user.fr",
"display_name": "Test user",
"transkribus_email": null,
"is_active": true,
"is_admin": false,
"verified_email": true,
......@@ -1803,7 +1797,6 @@
"last_login": null,
"email": "user2@user.fr",
"display_name": "Test user write",
"transkribus_email": null,
"is_active": true,
"is_admin": false,
"verified_email": true,
......@@ -1819,7 +1812,6 @@
"last_login": null,
"email": "user3@user.fr",
"display_name": "Test user read",
"transkribus_email": null,
"is_active": true,
"is_admin": false,
"verified_email": true,
......
......@@ -2,6 +2,7 @@ import multiprocessing
import os
import sys
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.wsgi import get_wsgi_application
......@@ -19,7 +20,7 @@ class Command(BaseCommand):
parser.add_argument(
"--port",
type=int,
help="Port to bind gunicorn",
help="Port to bind the Arkindex application",
default=int(os.environ.get("PORT", 8000)),
)
parser.add_argument(
......@@ -35,13 +36,15 @@ class Command(BaseCommand):
except ImportError:
raise CommandError("Gunicorn is not available")
assert port != settings.PROMETHEUS_METRICS_PORT, "Application and metrics should use different ports"
# Calc max workers
workers = (multiprocessing.cpu_count() * 2) + 1
if max_workers > 0:
workers = min(workers, max_workers)
# Build bind addresses for the application and the metrics endpoint
bind = f"{host}:{port}"
bind = [f"{host}:{port}", f"{host}:{settings.PROMETHEUS_METRICS_PORT}"]
self.stdout.write(f"Running server on {bind} with {workers} workers")
# Do not send out CLI args to gunicorn as they are not compatible
......
......@@ -38,10 +38,10 @@ from arkindex.process.models import (
WorkerType,
WorkerVersion,
)
from arkindex.training.models import Model
from arkindex.training.models import Dataset, DatasetElement, Model
from arkindex.users.models import Role, User
EXPORT_VERSION = 7
EXPORT_VERSION = 8
TABLE_NAMES = {
'export_version',
......@@ -59,6 +59,8 @@ TABLE_NAMES = {
'transcription_entity',
'metadata',
'classification',
'dataset',
'dataset_element',
}
SQL_TABLES_QUERY = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%'"
......@@ -137,6 +139,9 @@ SQL_METADATA_QUERY = "SELECT * FROM metadata"
SQL_CLASSIFICATION_QUERY = "SELECT * FROM classification"
SQL_DATASET_QUERY = "SELECT * FROM dataset"
SQL_ELEMENT_DATASET_QUERY = "SELECT * FROM dataset_element"
class Command(BaseCommand):
help = "Import an SQLite database generated by an Arkindex export"
......@@ -307,6 +312,24 @@ class Command(BaseCommand):
worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
)]
def convert_datasets(self, row):
return [Dataset(
id=row['id'],
corpus=self.corpus,
name=row['name'],
sets=[r.strip() for r in row['sets'].split(',')],
creator=self.user,
description='Imported dataset',
)]
def convert_dataset_elements(self, row):
return [DatasetElement(
id=row['id'],
element_id=row['element_id'],
dataset_id=row['dataset_id'],
set=row['set_name'],
)]
def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True):
# Model name for logs
verbose_name_plural = ModelClass._meta.verbose_name_plural.lower()
......@@ -438,7 +461,7 @@ class Command(BaseCommand):
level=Role.Admin.value,
)
model_version, _ = model.objects.get_or_create(id=row['model_version_id'])
model_version, _ = model.versions.get_or_create(id=row['model_version_id'])
if row['configuration_id']:
configuration, _ = WorkerConfiguration.objects.get_or_create(
......@@ -576,6 +599,12 @@ class Command(BaseCommand):
# Create classifications
self.bulk_create_objects(Classification, self.convert_classifications, SQL_CLASSIFICATION_QUERY)
# Create datasets
self.bulk_create_objects(Dataset, self.convert_datasets, SQL_DATASET_QUERY)
# Create dataset elements
self.bulk_create_objects(DatasetElement, self.convert_dataset_elements, SQL_ELEMENT_DATASET_QUERY)
self.stdout.write(self.style.SUCCESS(f"Created corpus {corpus_name} in {t.delta}"))
self.db.close()
......@@ -4,6 +4,7 @@ import django
from django.db import DJANGO_VERSION_PICKLE_KEY, connections, models
from arkindex.project.fields import Unnest
from arkindex.users.managers import BaseACLManager
from arkindex.users.models import Role
......@@ -216,16 +217,11 @@ class ElementManager(models.Manager):
return paths
class CorpusManager(models.Manager):
class CorpusManager(BaseACLManager):
'''
Add ACL functions to corpus listing
'''
def filter_rights(self, *args, **kwargs):
# Avoid circular dependencies as this module is imported by documents.models
from arkindex.users.utils import filter_rights
return filter_rights(*args, **kwargs)
def readable(self, user):
return super().get_queryset().filter(
id__in=(self.filter_rights(user, self.model, Role.Guest.value).values('id'))
......
......@@ -22,7 +22,7 @@ from arkindex.documents.dates import InterpretedDateMixin
from arkindex.documents.deletion import delete_element
from arkindex.documents.managers import CorpusManager, ElementManager
from arkindex.project.aws import S3FileMixin
from arkindex.project.default_corpus import DEFAULT_CORPUS_TYPES, DEFAULT_TRANSKRIBUS_TYPES
from arkindex.project.default_corpus import DEFAULT_CORPUS_TYPES
from arkindex.project.fields import ArrayConcat, ArrayField, LinearRingField
from arkindex.project.models import IndexableModel
......@@ -72,12 +72,6 @@ class Corpus(IndexableModel):
for values in DEFAULT_CORPUS_TYPES
)
def create_default_transkribus_types(self):
self.types.bulk_create(
ElementType(corpus=self, **values)
for values in DEFAULT_TRANSKRIBUS_TYPES
)
class ElementType(models.Model):
id = models.UUIDField(default=uuid.uuid4, primary_key=True, editable=False)
......
from corsheaders.signals import check_request_enabled
# List of endpoints open to any cross-origin request
OPEN_CORS_API = (
('api', 'folder-manifest'),
('api', 'element-annotation-list'),
)
def cors_allow_any_origin(sender, request, **kwargs):
route_match = request.resolver_match
if route_match is None:
return False
return (route_match.namespace, route_match.url_name) in OPEN_CORS_API
check_request_enabled.connect(cors_allow_any_origin)
......@@ -24,6 +24,7 @@ from arkindex.documents.models import (
)
from arkindex.ponos.models import Task
from arkindex.process.models import Process, ProcessDataset, ProcessElement, WorkerActivity, WorkerRun
from arkindex.training.models import DatasetElement
from arkindex.users.models import User
logger = logging.getLogger(__name__)
......@@ -67,17 +68,18 @@ def corpus_delete(corpus_id: str) -> None:
Transcription.objects.filter(element__corpus_id=corpus_id),
ElementPath.objects.filter(element__corpus_id=corpus_id),
Selection.objects.filter(element__corpus_id=corpus_id),
corpus.elements.all(),
corpus.types.all(),
corpus.memberships.all(),
corpus.exports.all(),
# ProcessDataset M2M
ProcessDataset.objects.filter(dataset__corpus_id=corpus_id),
ProcessDataset.objects.filter(process__corpus_id=corpus_id),
DatasetElement.objects.filter(dataset__corpus_id=corpus_id),
corpus.datasets.all(),
WorkerRun.objects.filter(process__corpus_id=corpus_id),
Task.objects.filter(process__corpus_id=corpus_id),
corpus.elements.all(),
WorkerRun.objects.filter(process__corpus_id=corpus_id),
corpus.processes.all(),
corpus.types.all(),
corpus.ml_classes.all(),
Corpus.objects.filter(id=corpus_id),
]
......