
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend
Commits on Source (42)
Showing with 459 additions and 424 deletions
......@@ -11,7 +11,7 @@ include:
# For jobs that run backend scripts directly
.backend-setup:
image: registry.gitlab.com/arkindex/backend/base:django-4
image: registry.gitlab.com/arkindex/backend/base:django-4.0.2
cache:
paths:
......
FROM registry.gitlab.com/arkindex/backend/base:django-4 as build
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2 as build
RUN mkdir build
ADD . build
RUN cd build && python3 setup.py sdist
FROM registry.gitlab.com/arkindex/backend/base:django-4
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2
ARG PONOS_BRANCH=master
ARG PONOS_ID=10017043
ARG TRANSKRIBUS_BRANCH=master
......
......@@ -61,7 +61,7 @@ RUN python -m nuitka \
arkindex/manage.py
# Start over from a clean setup
FROM registry.gitlab.com/arkindex/backend/base:django-4 as build
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2 as build
# Import files from compilation
RUN mkdir /usr/share/arkindex
......
1.2.0-beta3
1.2.0
......@@ -1125,13 +1125,18 @@ class WorkerRunList(WorkerACLMixin, ListCreateAPIView):
# User must have admin access to the process project and a contributor access to the worker run
if not self.has_access(process.corpus, Role.Admin.value):
raise PermissionDenied(detail='You do not have an admin access to the corpus of this process.')
try:
worker = Worker.objects.get(versions__id=serializer.validated_data['version_id'])
except Worker.DoesNotExist:
raise ValidationError({'worker_version_id': ['This version does not exist.']})
if not self.has_execution_access(worker):
raise ValidationError({'worker_version_id': ['You do not have an execution access to this version.']})
if process.worker_runs.filter(version_id=serializer.validated_data['version_id']).exists():
raise ValidationError({'worker_version_id': ['A WorkerRun with this version already exists on this process.']})
configuration = serializer.validated_data.pop('configuration_id', None)
if configuration and configuration.worker_id != worker.id:
raise ValidationError({'configuration_id': ['The configuration must be part of the same worker.']})
......
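The new checks in WorkerRunList (version existence, execution access, one run per version per process, configuration belonging to the same worker) all surface as 400 responses. As a rough sketch of the configuration check, here is a test method that could sit alongside the WorkerRun tests further down; the fixtures (self.dataimport, self.version, self.other_configuration) are placeholders, not objects defined in this diff:

    # Hypothetical test method for TestWorkerRuns; fixture names are placeholders.
    from django.urls import reverse
    from rest_framework import status

    def test_create_run_configuration_other_worker(self):
        self.client.force_login(self.user)
        response = self.client.post(
            reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport.id)}),
            data={
                'worker_version_id': str(self.version.id),
                'parents': [],
                # A configuration attached to a different worker than self.version's worker
                'configuration_id': str(self.other_configuration.id),
            },
            format='json',
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertEqual(response.json(), {'configuration_id': ['The configuration must be part of the same worker.']})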
#!/usr/bin/env python3
import logging
import uuid
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from arkindex.dataimport.models import Repository, RepositoryType, Revision, Worker, WorkerVersion
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = 'Create a fake worker version'
......@@ -37,7 +30,7 @@ class Command(BaseCommand):
versions = WorkerVersion.objects.filter(worker__slug=slug)
if versions.exists():
versions_id = list(versions.values_list('id', flat=True).distinct())
logger.info(f"This worker already has versions: {versions_id}")
raise CommandError(f"This worker already has versions: {versions_id}")
return
repo, _ = Repository.objects.get_or_create(
......@@ -68,4 +61,4 @@ class Command(BaseCommand):
configuration={"configuration": {}},
)
logger.info(f"Created a worker version: {version.id}")
self.stdout.write(self.style.SUCCESS(f"Created a worker version: {version.id}"))
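Raising CommandError instead of logging changes how this failure surfaces: manage.py catches it, prints it to stderr and exits with a non-zero status, while call_command() lets it propagate so tests can assert on it (as the updated test below does). A minimal sketch, with a placeholder slug that already has versions:

    from django.core.management import call_command
    from django.core.management.base import CommandError

    try:
        # 'existing-worker' is a placeholder slug that already has worker versions
        call_command('fake_worker_version', slug='existing-worker', name='Existing worker', url='http://example.com/repo')
    except CommandError as e:
        print(e)  # This worker already has versions: [...]

    # From a shell (flag names assumed from the keyword arguments above), the same
    # failure now exits with status 1 instead of logging and returning successfully:
    #   $ python manage.py fake_worker_version --slug existing-worker --name "Existing worker" --url http://example.com/repo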
#!/usr/bin/env python3
import logging
import yaml
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
......@@ -11,12 +9,6 @@ from arkindex.project.argparse import CorpusArgument
from arkindex.users.models import User
from ponos.models import Workflow
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)
IMPORT_NAME = 'import-s3'
MAX_DEPTH_WARN = 3
......@@ -90,10 +82,9 @@ class Command(BaseCommand):
# Warn for high recursion level
depth = options['max_folder_depth']
if depth > MAX_DEPTH_WARN:
logger.warning(
'Maximum folder depth set to {}. A high value can considerably slow tasks import distribution'
.format(depth)
)
self.stderr.write(self.style.WARNING(
f'Maximum folder depth set to {depth}. A high value can considerably slow tasks import distribution'
))
iiif_cache_bucket = options.get('iiif_cache_bucket')
......@@ -146,7 +137,7 @@ class Command(BaseCommand):
recipe.setdefault('env', {}).update(env_vars)
workflow = Workflow.objects.create(farm_id=get_default_farm_id(), recipe=yaml.dump(recipe))
logger.info('Created Workflow with id {}'.format(workflow.id))
self.stdout.write('Created Workflow with id {}'.format(workflow.id))
admin = User.objects.filter(is_admin=True).first()
assert admin is not None, 'No admin user has been found to create a Dataimport'
......@@ -157,7 +148,7 @@ class Command(BaseCommand):
corpus_id=options['corpus'].id,
creator_id=admin.id
)
logger.info(
self.stdout.write(self.style.SUCCESS(
'Linked Workflow to DataImport {0} using user {1.email} ({1.id})'
.format(dataimport.id, admin)
)
))
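More generally, these commands now route user-facing output through self.stdout and self.stderr rather than a module-level logger, so it can be captured by callers and tests. A minimal sketch of the pattern, reusing fake_worker_version from above since its options are visible in this diff (the slug, name and url values are placeholders):

    from io import StringIO
    from django.core.management import call_command

    out = StringIO()
    call_command(
        'fake_worker_version',
        slug='fresh-worker', name='Fresh worker', url='http://example.com/repo',
        stdout=out,
    )
    # self.stdout.write honours the stdout redirection above; the previous
    # logging calls would have bypassed it and gone to the process stderr.
    assert 'Created a worker version' in out.getvalue()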
import logging
from django.core.management.base import BaseCommand
from arkindex.dataimport.models import Repository
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger()
class Command(BaseCommand):
help = "Update all Git repositories hooks"
......@@ -23,7 +15,7 @@ class Command(BaseCommand):
def handle(self, base_url, *args, **kwargs):
for repository in Repository.objects.filter(credentials__isnull=False):
logger.info(f"Updating {repository}")
self.stdout.write(f"Updating {repository}")
repository.provider.create_hook(repository, base_url=base_url)
logger.info("All done")
self.stdout.write(self.style.SUCCESS("All done"))
import logging
from django.core.management.base import BaseCommand
from arkindex.dataimport.models import Repository
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger()
class Command(BaseCommand):
help = "Update all Git repositories references and trigger build processes"
......
......@@ -3,15 +3,16 @@ from typing import Optional
from django.conf import settings
from django.db import transaction
from django_rq import job
from rq import Retry
from arkindex.dataimport.models import ActivityState, DataImport, WorkerActivity, WorkerActivityState
@job('default', timeout=settings.RQ_TIMEOUTS['initialize_activity'])
@job('default', timeout=settings.RQ_TIMEOUTS['initialize_activity'], retry=Retry(max=4))
def initialize_activity(process: DataImport):
"""
List all worker versions used in a process and initialize their activity on processed elements.
Timeout is set to 1 hour
4 retries allowed, for a total of 5 attempts, to try to mitigate some database errors from the large query.
"""
try:
with transaction.atomic():
......
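For context on the new retry argument: rq's Retry(max=4) re-enqueues the job up to four more times after an initial failure, which is the "total of 5 attempts" the docstring mentions. A minimal sketch of the decorator shape; the interval argument is added purely as an illustration of what Retry supports and is not part of this change:

    from django.conf import settings
    from django_rq import job
    from rq import Retry

    @job(
        'default',
        timeout=settings.RQ_TIMEOUTS['initialize_activity'],
        # Wait 10s, 30s, 60s then 120s between successive attempts (illustrative only)
        retry=Retry(max=4, interval=[10, 30, 60, 120]),
    )
    def initialize_activity_sketch(process):
        ...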
from django.core.management import call_command
from django.core.management.base import CommandError
from arkindex.dataimport.models import Repository, RepositoryType, Revision, Worker, WorkerVersion, WorkerVersionState
from arkindex.project.tests import FixtureTestCase
......@@ -55,12 +56,13 @@ class TestFakeWorker(FixtureTestCase):
old_versions_id = list(old_versions.values_list('id', flat=True).distinct())
self.assertTrue(old_versions.exists())
call_command(
'fake_worker_version',
slug=slug,
name=name,
url=url,
)
with self.assertRaises(CommandError):
call_command(
'fake_worker_version',
slug=slug,
name=name,
url=url,
)
new_versions = WorkerVersion.objects.filter(worker__slug=slug)
new_versions_id = list(new_versions.values_list('id', flat=True).distinct())
......
......@@ -414,7 +414,7 @@ class TestRepositories(FixtureTestCase):
self.assertFalse(import_corpus.public)
self.assertEqual(import_corpus.name, 'IIIF import from http://gitlab/repo')
self.assertEqual(import_corpus.types.count(), 11)
self.assertEqual(import_corpus.types.count(), 6)
# User is granted an admin access to both the repository and the corpus
corpus_right = import_corpus.memberships.get(user=self.user)
......
......@@ -122,6 +122,15 @@ class TestWorkerRuns(FixtureAPITestCase):
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(response.json(), {'worker_version_id': ['This version does not exist.']})
def test_create_run_unique(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_1.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(response.json(), {'worker_version_id': ['A WorkerRun with this version already exists on this process.']})
def test_runs_post_invalid_dataimport_id(self):
self.client.force_login(self.user)
response = self.client.post(
......@@ -161,7 +170,7 @@ class TestWorkerRuns(FixtureAPITestCase):
def test_runs_post_create_worker_run(self):
self.client.force_login(self.user)
with self.assertNumQueries(14):
with self.assertNumQueries(15):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
......@@ -187,7 +196,7 @@ class TestWorkerRuns(FixtureAPITestCase):
self.worker_1.memberships.filter(user=self.user).update(level=Role.Guest.value)
self.worker_1.repository.memberships.create(user=self.user, level=Role.Contributor.value)
self.client.force_login(self.user)
with self.assertNumQueries(15):
with self.assertNumQueries(16):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
......@@ -210,7 +219,7 @@ class TestWorkerRuns(FixtureAPITestCase):
def test_create_run_configuration(self):
self.client.force_login(self.user)
with self.assertNumQueries(15):
with self.assertNumQueries(16):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={
......
......@@ -571,6 +571,17 @@ class ElementsListBase(CorpusACLMixin, DestroyModelMixin, ListAPIView):
if self.type_filter:
filters['type'] = self.type_filter
elif self.folder_filter is not None:
# When filtering for folder or non-folder elements, using only the type__folder filter
# can cause Postgres to retrieve all the {non-}folder types on every corpus
# This can reach hundreds of types as the database grows, so Postgres can end up using a Hash Join
# to handle joining this large amount of elements and types.
# Since Postgres estimates this to represent a large amount of rows, it might also use multi-processing,
# which has a very high overhead.
# This can be avoided by also filtering on the type's corpus: Postgres will then access the index
# on the type's corpus ID. The query planner's statistics will give it a very low estimation since there
# rarely are a ton of types in a corpus, so Postgres will also use the type_id index on elements, which
# will lower the amount of rows much more quickly, making it stop using multi-processing.
filters['type__corpus'] = self.selected_corpus
filters['type__folder'] = self.folder_filter
if 'worker_version' in self.clean_params:
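The comment above boils down to the difference between two ORM filters. A rough sketch, assuming Element carries a corpus foreign key as the surrounding filters suggest (corpus stands for the selected Corpus instance):

    from arkindex.documents.models import Element

    # Before: only type__folder is constrained, so the planner may consider every
    # folder (or non-folder) type across all corpora when joining elements and types.
    folders = Element.objects.filter(corpus=corpus, type__folder=True)

    # After: constraining the type's corpus as well keeps the candidate types small,
    # steering Postgres towards the corpus and type_id indexes instead of a hash join.
    folders = Element.objects.filter(corpus=corpus, type__corpus=corpus, type__folder=True)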
......@@ -816,32 +827,22 @@ class ElementChildren(ElementsListBase):
Requires a **read** access to the element's corpus.
"""
@property
def is_recursive(self):
recursive_param = self.clean_params.get('recursive')
return recursive_param is not None and recursive_param.lower() not in ('false', '0')
@property
def selected_corpus(self):
return self.element.corpus
def get_filters(self):
filters = super().get_filters()
recursive_param = self.clean_params.get('recursive')
if recursive_param is None or recursive_param.lower() in ('false', '0'):
# Only list direct children, by listing all the direct paths from the current element
# This is faster for direct children than :
# - Element.objects.filter(paths__path__last=XXX)
# - Element.objects.get_descending()
# This code path is extremely used, as it's the basis for navigation amongst elements
# Be extremely careful when changing this behaviour.
paths = self.element.paths.values_list('path', flat=True)
if not paths:
# Support top level children
paths = [[]]
filters['paths__path__in'] = [
path + [self.kwargs['pk']]
for path in paths
]
else:
if self.is_recursive:
filters['paths__path__overlap'] = [self.kwargs['pk']]
else:
filters['paths__path__last'] = self.kwargs['pk']
return filters
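Concretely, the non-recursive branch builds the children filter by appending the parent's id to each of the parent's own paths. A small worked example mirroring the code above (ids are shortened placeholders):

    parent_id = 'P'                      # stands for self.kwargs['pk']
    paths = [['A'], ['B', 'C']]          # stands for self.element.paths values
    if not paths:
        paths = [[]]                     # top-level parent: its children have path ['P']
    child_paths = [path + [parent_id] for path in paths]
    # child_paths == [['A', 'P'], ['B', 'C', 'P']]
    # so filters['paths__path__in'] = child_paths lists direct children only,
    # while the recursive case uses paths__path__overlap = ['P'] instead.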
......@@ -1148,6 +1149,16 @@ class CorpusList(ListCreateAPIView):
for c in corpora:
c.access_level = corpora_level[c.id]
# Avoid making any extra queries to retrieve the top_level_type.
# We already have prefetched the types, so we can use corpus.types.all()
# when a corpus has a top_level_type_id to retrieve it.
if c.top_level_type_id is not None:
c.top_level_type = next(
element_type
for element_type in c.types.all()
if element_type.id == c.top_level_type_id
)
return corpora
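This loop only avoids extra queries when the types were prefetched on the queryset. A minimal sketch of the intended access pattern (the import path and queryset are assumptions; the actual CorpusList queryset is not shown in this excerpt):

    from arkindex.documents.models import Corpus

    corpora = Corpus.objects.prefetch_related('types')
    for corpus in corpora:
        if corpus.top_level_type_id is not None:
            # Resolved from the prefetch cache, so no per-corpus query is emitted,
            # unlike accessing corpus.top_level_type directly without a prefetch.
            corpus.top_level_type = next(
                element_type
                for element_type in corpus.types.all()
                if element_type.id == corpus.top_level_type_id
            )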
......@@ -1766,7 +1777,7 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
worker_results_delete(
corpus_id=corpus.id,
version=worker_version,
parent_id=element_id,
element_id=element_id,
user_id=self.request.user.id,
)
......
import logging
from uuid import UUID
from django.core.exceptions import ValidationError
......@@ -41,8 +40,6 @@ from arkindex.project.mixins import ACLMixin, CorpusACLMixin
from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
from arkindex.users.models import Role
logger = logging.getLogger(__name__)
@extend_schema(tags=['entities'])
@extend_schema_view(
......
import logging
import uuid
from django.db import transaction
......@@ -35,8 +34,6 @@ from arkindex.project.mixins import ACLMixin, CorpusACLMixin, SelectionMixin
from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
from arkindex.users.models import Role
logger = logging.getLogger(__name__)
@extend_schema_view(
post=extend_schema(
......
PRAGMA foreign_keys = ON;
CREATE TABLE export_version AS SELECT 2 AS version;
CREATE TABLE export_version AS SELECT 3 AS version;
CREATE TABLE image_server (
id VARCHAR(37) NOT NULL,
......@@ -114,12 +114,14 @@ CREATE TABLE transcription_entity (
offset INTEGER NOT NULL,
length INTEGER NOT NULL,
worker_version_id VARCHAR(37),
confidence REAL,
PRIMARY KEY (id),
FOREIGN KEY (transcription_id) REFERENCES transcription (id) ON DELETE CASCADE,
FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE CASCADE,
FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
UNIQUE (transcription_id, entity_id, offset, length, worker_version_id),
CHECK (offset >= 0 AND length >= 0)
CHECK (offset >= 0 AND length >= 0),
CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1))
);
CREATE TABLE entity_role (
......
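Since the export format version is bumped to 3 and transcription_entity gains a nullable confidence column constrained to [0, 1], consumers of the SQLite export can read it as sketched below (the file path is a placeholder):

    import sqlite3

    con = sqlite3.connect('corpus_export.sqlite')  # placeholder path to an Arkindex SQLite export
    version, = con.execute('SELECT version FROM export_version').fetchone()
    assert version >= 3, 'confidence only exists from export version 3 onwards'

    rows = con.execute(
        'SELECT transcription_id, entity_id, confidence '
        'FROM transcription_entity WHERE confidence IS NOT NULL'
    ).fetchall()
    # confidence stays NULL for transcription entities created without a score.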
......@@ -6,7 +6,8 @@ SELECT
te.entity_id,
te.offset,
te.length,
te.worker_version_id
te.worker_version_id,
te.confidence
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
This diff is collapsed.
......@@ -7,12 +7,6 @@ from teklia_toolbox.time import Timer
from arkindex.documents.models import Element
# Enable deletion signal logs
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
def valid_uuid(value):
try:
......@@ -34,6 +28,9 @@ class Command(BaseCommand):
)
def handle(self, element_id, verbosity=0, **options):
# Enable deletion signal logs
logging.getLogger('arkindex.documents.delete').setLevel(logging.INFO)
for element in Element.objects.filter(id__in=element_id):
self.stdout.write(f"Deleting {element.id} : {element}")
with Timer() as t:
......