
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend
Commits on Source (42)
Showing with 459 additions and 424 deletions
......@@ -11,7 +11,7 @@ include:
# For jobs that run backend scripts directly
.backend-setup:
image: registry.gitlab.com/arkindex/backend/base:django-4
image: registry.gitlab.com/arkindex/backend/base:django-4.0.2
cache:
paths:
......
FROM registry.gitlab.com/arkindex/backend/base:django-4 as build
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2 as build
RUN mkdir build
ADD . build
RUN cd build && python3 setup.py sdist
FROM registry.gitlab.com/arkindex/backend/base:django-4
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2
ARG PONOS_BRANCH=master
ARG PONOS_ID=10017043
ARG TRANSKRIBUS_BRANCH=master
......
......@@ -61,7 +61,7 @@ RUN python -m nuitka \
arkindex/manage.py
# Start over from a clean setup
FROM registry.gitlab.com/arkindex/backend/base:django-4 as build
FROM registry.gitlab.com/arkindex/backend/base:django-4.0.2 as build
# Import files from compilation
RUN mkdir /usr/share/arkindex
......
1.2.0-beta3
1.2.0
......@@ -1125,13 +1125,18 @@ class WorkerRunList(WorkerACLMixin, ListCreateAPIView):
# User must have admin access to the process project and a contributor access to the worker run
if not self.has_access(process.corpus, Role.Admin.value):
raise PermissionDenied(detail='You do not have an admin access to the corpus of this process.')
try:
worker = Worker.objects.get(versions__id=serializer.validated_data['version_id'])
except Worker.DoesNotExist:
raise ValidationError({'worker_version_id': ['This version does not exist.']})
if not self.has_execution_access(worker):
raise ValidationError({'worker_version_id': ['You do not have an execution access to this version.']})
if process.worker_runs.filter(version_id=serializer.validated_data['version_id']).exists():
raise ValidationError({'worker_version_id': ['A WorkerRun with this version already exists on this process.']})
configuration = serializer.validated_data.pop('configuration_id', None)
if configuration and configuration.worker_id != worker.id:
raise ValidationError({'configuration_id': ['The configuration must be part of the same worker.']})
......
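The new checks in WorkerRunList (version existence, execution access, one run per version per process, configuration belonging to the same worker) all surface as 400 responses. As a rough sketch of the configuration check, here is a test method that could sit alongside the WorkerRun tests further down; the fixtures (self.dataimport, self.version, self.other_configuration) are placeholders, not objects defined in this diff:

    # Hypothetical test method for TestWorkerRuns; fixture names are placeholders.
    from django.urls import reverse
    from rest_framework import status

    def test_create_run_configuration_other_worker(self):
        self.client.force_login(self.user)
        response = self.client.post(
            reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport.id)}),
            data={
                'worker_version_id': str(self.version.id),
                'parents': [],
                # A configuration attached to a different worker than self.version's worker
                'configuration_id': str(self.other_configuration.id),
            },
            format='json',
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertEqual(response.json(), {'configuration_id': ['The configuration must be part of the same worker.']})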
#!/usr/bin/env python3
import logging
import uuid
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from arkindex.dataimport.models import Repository, RepositoryType, Revision, Worker, WorkerVersion
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)
class Command(BaseCommand):
help = 'Create a fake worker version'
......@@ -37,7 +30,7 @@ class Command(BaseCommand):
versions = WorkerVersion.objects.filter(worker__slug=slug)
if versions.exists():
versions_id = list(versions.values_list('id', flat=True).distinct())
logger.info(f"This worker already has versions: {versions_id}")
raise CommandError(f"This worker already has versions: {versions_id}")
return
repo, _ = Repository.objects.get_or_create(
......@@ -68,4 +61,4 @@ class Command(BaseCommand):
configuration={"configuration": {}},
)
logger.info(f"Created a worker version: {version.id}")
self.stdout.write(self.style.SUCCESS(f"Created a worker version: {version.id}"))
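Raising CommandError instead of logging changes how this failure surfaces: manage.py catches it, prints it to stderr and exits with a non-zero status, while call_command() lets it propagate so tests can assert on it (as the updated test below does). A minimal sketch, with a placeholder slug that already has versions:

    from django.core.management import call_command
    from django.core.management.base import CommandError

    try:
        # 'existing-worker' is a placeholder slug that already has worker versions
        call_command('fake_worker_version', slug='existing-worker', name='Existing worker', url='http://example.com/repo')
    except CommandError as e:
        print(e)  # This worker already has versions: [...]

    # From a shell (flag names assumed from the keyword arguments above), the same
    # failure now exits with status 1 instead of logging and returning successfully:
    #   $ python manage.py fake_worker_version --slug existing-worker --name "Existing worker" --url http://example.com/repo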
#!/usr/bin/env python3
import logging
import yaml
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
......@@ -11,12 +9,6 @@ from arkindex.project.argparse import CorpusArgument
from arkindex.users.models import User
from ponos.models import Workflow
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)
IMPORT_NAME = 'import-s3'
MAX_DEPTH_WARN = 3
......@@ -90,10 +82,9 @@ class Command(BaseCommand):
# Warn for high recursion level
depth = options['max_folder_depth']
if depth > MAX_DEPTH_WARN:
logger.warning(
'Maximum folder depth set to {}. A high value can considerably slow tasks import distribution'
.format(depth)
)
self.stderr.write(self.style.WARNING(
f'Maximum folder depth set to {depth}. A high value can considerably slow tasks import distribution'
))
iiif_cache_bucket = options.get('iiif_cache_bucket')
......@@ -146,7 +137,7 @@ class Command(BaseCommand):
recipe.setdefault('env', {}).update(env_vars)
workflow = Workflow.objects.create(farm_id=get_default_farm_id(), recipe=yaml.dump(recipe))
logger.info('Created Workflow with id {}'.format(workflow.id))
self.stdout.write('Created Workflow with id {}'.format(workflow.id))
admin = User.objects.filter(is_admin=True).first()
assert admin is not None, 'No admin user has been found to create a Dataimport'
......@@ -157,7 +148,7 @@ class Command(BaseCommand):
corpus_id=options['corpus'].id,
creator_id=admin.id
)
logger.info(
self.stdout.write(self.style.SUCCESS(
'Linked Workflow to DataImport {0} using user {1.email} ({1.id})'
.format(dataimport.id, admin)
)
))
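More generally, these commands now route user-facing output through self.stdout and self.stderr rather than a module-level logger, so it can be captured by callers and tests. A minimal sketch of the pattern, reusing fake_worker_version from above since its options are visible in this diff (the slug, name and url values are placeholders):

    from io import StringIO
    from django.core.management import call_command

    out = StringIO()
    call_command(
        'fake_worker_version',
        slug='fresh-worker', name='Fresh worker', url='http://example.com/repo',
        stdout=out,
    )
    # self.stdout.write honours the stdout redirection above; the previous
    # logging calls would have bypassed it and gone to the process stderr.
    assert 'Created a worker version' in out.getvalue()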
import logging
from django.core.management.base import BaseCommand
from arkindex.dataimport.models import Repository
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger()
class Command(BaseCommand):
help = "Update all Git repositories hooks"
......@@ -23,7 +15,7 @@ class Command(BaseCommand):
def handle(self, base_url, *args, **kwargs):
for repository in Repository.objects.filter(credentials__isnull=False):
logger.info(f"Updating {repository}")
self.stdout.write(f"Updating {repository}")
repository.provider.create_hook(repository, base_url=base_url)
logger.info("All done")
self.stdout.write(self.style.SUCCESS("All done"))
import logging
from django.core.management.base import BaseCommand
from arkindex.dataimport.models import Repository
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger()
class Command(BaseCommand):
help = "Update all Git repositories references and trigger build processes"
......
......@@ -3,15 +3,16 @@ from typing import Optional
from django.conf import settings
from django.db import transaction
from django_rq import job
from rq import Retry
from arkindex.dataimport.models import ActivityState, DataImport, WorkerActivity, WorkerActivityState
@job('default', timeout=settings.RQ_TIMEOUTS['initialize_activity'])
@job('default', timeout=settings.RQ_TIMEOUTS['initialize_activity'], retry=Retry(max=4))
def initialize_activity(process: DataImport):
"""
List all worker versions used in a process and initialize their activity on processed elements.
Timeout is set to 1 hour
4 retries allowed, for a total of 5 attempts, to try to mitigate some database errors from the large query.
"""
try:
with transaction.atomic():
......
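For context on the new retry argument: rq's Retry(max=4) re-enqueues the job up to four more times after an initial failure, which is the "total of 5 attempts" the docstring mentions. A minimal sketch of the decorator shape; the interval argument is added purely as an illustration of what Retry supports and is not part of this change:

    from django.conf import settings
    from django_rq import job
    from rq import Retry

    @job(
        'default',
        timeout=settings.RQ_TIMEOUTS['initialize_activity'],
        # Wait 10s, 30s, 60s then 120s between successive attempts (illustrative only)
        retry=Retry(max=4, interval=[10, 30, 60, 120]),
    )
    def initialize_activity_sketch(process):
        ...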
from django.core.management import call_command
from django.core.management.base import CommandError
from arkindex.dataimport.models import Repository, RepositoryType, Revision, Worker, WorkerVersion, WorkerVersionState
from arkindex.project.tests import FixtureTestCase
......@@ -55,12 +56,13 @@ class TestFakeWorker(FixtureTestCase):
old_versions_id = list(old_versions.values_list('id', flat=True).distinct())
self.assertTrue(old_versions.exists())
call_command(
'fake_worker_version',
slug=slug,
name=name,
url=url,
)
with self.assertRaises(CommandError):
call_command(
'fake_worker_version',
slug=slug,
name=name,
url=url,
)
new_versions = WorkerVersion.objects.filter(worker__slug=slug)
new_versions_id = list(new_versions.values_list('id', flat=True).distinct())
......
......@@ -414,7 +414,7 @@ class TestRepositories(FixtureTestCase):
self.assertFalse(import_corpus.public)
self.assertEqual(import_corpus.name, 'IIIF import from http://gitlab/repo')
self.assertEqual(import_corpus.types.count(), 11)
self.assertEqual(import_corpus.types.count(), 6)
# User is granted an admin access to both the repository and the corpus
corpus_right = import_corpus.memberships.get(user=self.user)
......
......@@ -122,6 +122,15 @@ class TestWorkerRuns(FixtureAPITestCase):
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(response.json(), {'worker_version_id': ['This version does not exist.']})
def test_create_run_unique(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_1.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(response.json(), {'worker_version_id': ['A WorkerRun with this version already exists on this process.']})
def test_runs_post_invalid_dataimport_id(self):
self.client.force_login(self.user)
response = self.client.post(
......@@ -161,7 +170,7 @@ class TestWorkerRuns(FixtureAPITestCase):
def test_runs_post_create_worker_run(self):
self.client.force_login(self.user)
with self.assertNumQueries(14):
with self.assertNumQueries(15):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
......@@ -187,7 +196,7 @@ class TestWorkerRuns(FixtureAPITestCase):
self.worker_1.memberships.filter(user=self.user).update(level=Role.Guest.value)
self.worker_1.repository.memberships.create(user=self.user, level=Role.Contributor.value)
self.client.force_login(self.user)
with self.assertNumQueries(15):
with self.assertNumQueries(16):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={'worker_version_id': str(self.version_1.id), 'parents': []}, format='json'
......@@ -210,7 +219,7 @@ class TestWorkerRuns(FixtureAPITestCase):
def test_create_run_configuration(self):
self.client.force_login(self.user)
with self.assertNumQueries(15):
with self.assertNumQueries(16):
response = self.client.post(
reverse('api:worker-run-list', kwargs={'pk': str(self.dataimport_2.id)}),
data={
......
......@@ -571,6 +571,17 @@ class ElementsListBase(CorpusACLMixin, DestroyModelMixin, ListAPIView):
if self.type_filter:
filters['type'] = self.type_filter
elif self.folder_filter is not None:
# When filtering for folder or non-folder elements, using only the type__folder filter
# can cause Postgres to retrieve all the {non-}folder types on every corpus
# This can reach hundreds of types as the database grows, so Postgres can end up using a Hash Join
# to handle joining this large amount of elements and types.
# Since Postgres estimates this to represent a large amount of rows, it might also use multi-processing,
# which has a very high overhead.
# This can be avoided by also filtering on the type's corpus: Postgres will then access the index
# on the type's corpus ID. The query planner's statistics will give it a very low estimation since there
# rarely are a ton of types in a corpus, so Postgres will also use the type_id index on elements, which
# will lower the amount of rows much more quickly, making it stop using multi-processing.
filters['type__corpus'] = self.selected_corpus
filters['type__folder'] = self.folder_filter
if 'worker_version' in self.clean_params:
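The comment above boils down to the difference between two ORM filters. A rough sketch, assuming Element carries a corpus foreign key as the surrounding filters suggest (corpus stands for the selected Corpus instance):

    from arkindex.documents.models import Element

    # Before: only type__folder is constrained, so the planner may consider every
    # folder (or non-folder) type across all corpora when joining elements and types.
    folders = Element.objects.filter(corpus=corpus, type__folder=True)

    # After: constraining the type's corpus as well keeps the candidate types small,
    # steering Postgres towards the corpus and type_id indexes instead of a hash join.
    folders = Element.objects.filter(corpus=corpus, type__corpus=corpus, type__folder=True)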
......@@ -816,32 +827,22 @@ class ElementChildren(ElementsListBase):
Requires a **read** access to the element's corpus.
"""
@property
def is_recursive(self):
recursive_param = self.clean_params.get('recursive')
return recursive_param is not None and recursive_param.lower() not in ('false', '0')
@property
def selected_corpus(self):
return self.element.corpus
def get_filters(self):
filters = super().get_filters()
recursive_param = self.clean_params.get('recursive')
if recursive_param is None or recursive_param.lower() in ('false', '0'):
# Only list direct children, by listing all the direct paths from the current element
# This is faster for direct children than :
# - Element.objects.filter(paths__path__last=XXX)
# - Element.objects.get_descending()
# This code path is extremely used, as it's the basis for navigation amongst elements
# Be extremely careful when changing this behaviour.
paths = self.element.paths.values_list('path', flat=True)
if not paths:
# Support top level children
paths = [[]]
filters['paths__path__in'] = [
path + [self.kwargs['pk']]
for path in paths
]
else:
if self.is_recursive:
filters['paths__path__overlap'] = [self.kwargs['pk']]
else:
filters['paths__path__last'] = self.kwargs['pk']
return filters
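Concretely, the non-recursive branch builds the children filter by appending the parent's id to each of the parent's own paths. A small worked example mirroring the code above (ids are shortened placeholders):

    parent_id = 'P'                      # stands for self.kwargs['pk']
    paths = [['A'], ['B', 'C']]          # stands for self.element.paths values
    if not paths:
        paths = [[]]                     # top-level parent: its children have path ['P']
    child_paths = [path + [parent_id] for path in paths]
    # child_paths == [['A', 'P'], ['B', 'C', 'P']]
    # so filters['paths__path__in'] = child_paths lists direct children only,
    # while the recursive case uses paths__path__overlap = ['P'] instead.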
......@@ -1148,6 +1149,16 @@ class CorpusList(ListCreateAPIView):
for c in corpora:
c.access_level = corpora_level[c.id]
# Avoid making any extra queries to retrieve the top_level_type.
# We already have prefetched the types, so we can use corpus.types.all()
# when a corpus has a top_level_type_id to retrieve it.
if c.top_level_type_id is not None:
c.top_level_type = next(
element_type
for element_type in c.types.all()
if element_type.id == c.top_level_type_id
)
return corpora
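This loop only avoids extra queries when the types were prefetched on the queryset. A minimal sketch of the intended access pattern (the import path and queryset are assumptions; the actual CorpusList queryset is not shown in this excerpt):

    from arkindex.documents.models import Corpus

    corpora = Corpus.objects.prefetch_related('types')
    for corpus in corpora:
        if corpus.top_level_type_id is not None:
            # Resolved from the prefetch cache, so no per-corpus query is emitted,
            # unlike accessing corpus.top_level_type directly without a prefetch.
            corpus.top_level_type = next(
                element_type
                for element_type in corpus.types.all()
                if element_type.id == corpus.top_level_type_id
            )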
......@@ -1766,7 +1777,7 @@ class WorkerResultsDestroy(CorpusACLMixin, DestroyAPIView):
worker_results_delete(
corpus_id=corpus.id,
version=worker_version,
parent_id=element_id,
element_id=element_id,
user_id=self.request.user.id,
)
......
import logging
from uuid import UUID
from django.core.exceptions import ValidationError
......@@ -41,8 +40,6 @@ from arkindex.project.mixins import ACLMixin, CorpusACLMixin
from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
from arkindex.users.models import Role
logger = logging.getLogger(__name__)
@extend_schema(tags=['entities'])
@extend_schema_view(
......
import logging
import uuid
from django.db import transaction
......@@ -35,8 +34,6 @@ from arkindex.project.mixins import ACLMixin, CorpusACLMixin, SelectionMixin
from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
from arkindex.users.models import Role
logger = logging.getLogger(__name__)
@extend_schema_view(
post=extend_schema(
......
PRAGMA foreign_keys = ON;
CREATE TABLE export_version AS SELECT 2 AS version;
CREATE TABLE export_version AS SELECT 3 AS version;
CREATE TABLE image_server (
id VARCHAR(37) NOT NULL,
......@@ -114,12 +114,14 @@ CREATE TABLE transcription_entity (
offset INTEGER NOT NULL,
length INTEGER NOT NULL,
worker_version_id VARCHAR(37),
confidence REAL,
PRIMARY KEY (id),
FOREIGN KEY (transcription_id) REFERENCES transcription (id) ON DELETE CASCADE,
FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE CASCADE,
FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
UNIQUE (transcription_id, entity_id, offset, length, worker_version_id),
CHECK (offset >= 0 AND length >= 0)
CHECK (offset >= 0 AND length >= 0),
CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1))
);
CREATE TABLE entity_role (
......
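Since the export format version is bumped to 3 and transcription_entity gains a nullable confidence column constrained to [0, 1], consumers of the SQLite export can read it as sketched below (the file path is a placeholder):

    import sqlite3

    con = sqlite3.connect('corpus_export.sqlite')  # placeholder path to an Arkindex SQLite export
    version, = con.execute('SELECT version FROM export_version').fetchone()
    assert version >= 3, 'confidence only exists from export version 3 onwards'

    rows = con.execute(
        'SELECT transcription_id, entity_id, confidence '
        'FROM transcription_entity WHERE confidence IS NOT NULL'
    ).fetchall()
    # confidence stays NULL for transcription entities created without a score.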
......@@ -6,7 +6,8 @@ SELECT
te.entity_id,
te.offset,
te.length,
te.worker_version_id
te.worker_version_id,
te.confidence
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
This diff is collapsed.
......@@ -7,12 +7,6 @@ from teklia_toolbox.time import Timer
from arkindex.documents.models import Element
# Enable deletion signal logs
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
)
def valid_uuid(value):
try:
......@@ -34,6 +28,9 @@ class Command(BaseCommand):
)
def handle(self, element_id, verbosity=0, **options):
# Enable deletion signal logs
logging.getLogger('arkindex.documents.delete').setLevel(logging.INFO)
for element in Element.objects.filter(id__in=element_id):
self.stdout.write(f"Deleting {element.id} : {element}")
with Timer() as t:
......