Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: arkindex/backend

Commits on Source (41)
Showing 972 additions and 1194 deletions
repos:
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
  - repo: https://github.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        additional_dependencies:
          - 'flake8-copyright==0.2.2'
          - 'flake8-debugger==3.1.0'
          - 'flake8-quotes==3.3.2'
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.11
    # Ruff version.
    rev: v0.3.7
    hooks:
      # Run the linter.
      - id: ruff
        args: [--fix]
  - repo: https://github.com/pre-commit/pre-commit-hooks
......
@@ -16,7 +16,7 @@ build:
	CI_PROJECT_DIR=$(ROOT_DIR) CI_REGISTRY_IMAGE=$(IMAGE_TAG) $(ROOT_DIR)/ci/build.sh

worker:
	arkindex rqworker -v 2 default high tasks
	arkindex rqworker -v 2 default high tasks export

test-fixtures:
	$(eval export PGPASSWORD=devdata)
@@ -54,7 +54,7 @@ release:

clean-docker:
	$(eval containers:=$(shell docker ps -a -q))
	@if [ -n "$(containers)" ]; then \
		echo "Cleaning up past containers\n" \
		echo "Cleaning up past containers\n" ; \
		docker rm -f $(containers) ; \
	fi
......
1.6.0-beta3
1.6.1-beta2
@@ -11,8 +11,6 @@ from arkindex.documents.models import (
    Element,
    ElementType,
    Entity,
    EntityLink,
    EntityRole,
    EntityType,
    MetaData,
    MLClass,
@@ -135,29 +133,15 @@ class EntityMetaForm(forms.ModelForm):
    metas = HStoreFormField()


class EntityLinkInLine(admin.TabularInline):
    model = EntityLink
    fk_name = "parent"
    raw_id_fields = ("child", )


class EntityAdmin(admin.ModelAdmin):
    list_display = ("id", "name", "type")
    list_filter = ["corpus", "type"]
    readonly_fields = ("id", )
    raw_id_fields = ("worker_version", "worker_run", )
    search_fields = ("name", )
    inlines = (EntityLinkInLine, )
    form = EntityMetaForm


class EntityRoleAdmin(admin.ModelAdmin):
    list_display = ("id", "corpus", "parent_name", "child_name")
    list_filter = ("corpus", )
    readonly_fields = ("id", )
    ordering = ("corpus", "parent_name", "child_name")


class EntityTypeAdmin(admin.ModelAdmin):
    list_display = ("id", "corpus", "name", "color")
    list_filter = ("corpus", )
@@ -180,7 +164,6 @@ admin.site.register(Transcription, TranscriptionAdmin)
admin.site.register(MLClass, MLClassAdmin)
admin.site.register(MetaData, MetaDataAdmin)
admin.site.register(Entity, EntityAdmin)
admin.site.register(EntityRole, EntityRoleAdmin)
admin.site.register(EntityType, EntityTypeAdmin)
admin.site.register(AllowedMetaData, AllowedMetaDataAdmin)
admin.site.register(CorpusExport, CorpusExportAdmin)
@@ -3,32 +3,18 @@ from textwrap import dedent
from uuid import UUID

from django.core.exceptions import ValidationError as DjangoValidationError
from django.db.models import Q
from django.shortcuts import get_object_or_404
from drf_spectacular.utils import OpenApiExample, OpenApiParameter, OpenApiResponse, extend_schema, extend_schema_view
from drf_spectacular.utils import OpenApiParameter, OpenApiResponse, extend_schema, extend_schema_view
from rest_framework import permissions, serializers, status
from rest_framework.exceptions import NotFound, PermissionDenied, ValidationError
from rest_framework.generics import CreateAPIView, ListAPIView, ListCreateAPIView, RetrieveUpdateDestroyAPIView
from rest_framework.generics import CreateAPIView, ListAPIView, RetrieveUpdateDestroyAPIView
from rest_framework.response import Response

from arkindex.documents.models import (
    Corpus,
    Element,
    Entity,
    EntityLink,
    EntityRole,
    EntityType,
    Transcription,
    TranscriptionEntity,
)
from arkindex.documents.models import Corpus, Element, Entity, EntityType, Transcription, TranscriptionEntity
from arkindex.documents.serializers.elements import ElementTinySerializer
from arkindex.documents.serializers.entities import (
    BaseEntitySerializer,
    CreateEntityRoleErrorResponseSerializer,
    EntityCreateSerializer,
    EntityLinkCreateSerializer,
    EntityLinkSerializer,
    EntityRoleSerializer,
    EntitySerializer,
    EntityTypeCreateSerializer,
    EntityTypeSerializer,
@@ -44,53 +30,6 @@ from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
from arkindex.users.models import Role


@extend_schema(tags=["entities"])
@extend_schema_view(
    get=extend_schema(operation_id="ListCorpusRoles", description="List all roles of a corpus"),
    post=extend_schema(
        description="Create a new entity role",
        responses={
            200: EntityRoleSerializer,
            400: CreateEntityRoleErrorResponseSerializer
        },
        examples=[OpenApiExample(
            status_codes=["400"],
            response_only=True,
            name="role-exists",
            value={"id": "55cd009d-cd4b-4ec2-a475-b060f98f9138", "corpus": ["Role already exists in this corpus"]},
            description="Role already exists."
        )]
    )
)
class CorpusRoles(CorpusACLMixin, ListCreateAPIView):
    """
    List all roles in a corpus
    """
    permission_classes = (IsVerifiedOrReadOnly, )
    serializer_class = EntityRoleSerializer
    queryset = EntityRole.objects.none()

    def get_queryset(self):
        return EntityRole.objects \
            .filter(corpus=self.get_corpus(self.kwargs["pk"])) \
            .order_by("parent_name", "child_name")

    def perform_create(self, serializer):
        data = self.request.data
        if EntityRole.objects.filter(
            parent_name=data["parent_name"],
            child_name=data["child_name"],
            parent_type=data["parent_type_id"],
            child_type=data["child_type_id"],
            corpus_id=self.request.parser_context["kwargs"]["pk"]
        ).exists():
            raise serializers.ValidationError({
                "corpus": ["Role already exists in this corpus"],
                "id": self.request.parser_context["kwargs"]["pk"]
            })
        super().perform_create(serializer)


@extend_schema(tags=["entities"])
@extend_schema_view(
    get=extend_schema(operation_id="ListCorpusEntityTypes", description="List all entity types in a corpus"),
@@ -173,8 +112,6 @@ class EntityTypeUpdate(ACLMixin, RetrieveUpdateDestroyAPIView):
    def perform_destroy(self, instance):
        if instance.entities.exists():
            raise ValidationError({"detail": ["Some entities are using this entity type."]})
        if EntityRole.objects.filter(Q(parent_type_id=instance.id) | Q(child_type_id=instance.id)).exists():
            raise ValidationError({"detail": ["Some entity roles are using this entity type."]})
        super().perform_destroy(instance)
@@ -196,14 +133,6 @@ class EntityDetails(ACLMixin, RetrieveUpdateDestroyAPIView):
            .select_related("corpus", "type") \
            .filter(corpus__in=Corpus.objects.readable(self.request.user)) \
            .prefetch_related(
                "parents__role__parent_type",
                "parents__role__child_type",
                "children__role__parent_type",
                "children__role__child_type",
                "parents__child__type",
                "parents__parent__type",
                "children__parent__type",
                "children__child__type",
                "corpus",
            )
@@ -307,15 +236,6 @@ class EntityCreate(CreateAPIView):
        return Response(entity.data, status=status_code, headers=headers)


@extend_schema_view(post=extend_schema(operation_id="CreateEntityLink", tags=["entities"]))
class EntityLinkCreate(CreateAPIView):
    """
    Create a new link between two entities with a role
    """
    permission_classes = (IsVerified, )
    serializer_class = EntityLinkCreateSerializer


@extend_schema_view(post=extend_schema(
    operation_id="CreateTranscriptionEntity",
    tags=["entities"],
@@ -519,41 +439,6 @@ class CorpusEntities(CorpusACLMixin, ListAPIView):
        return queryset


@extend_schema_view(get=extend_schema(operation_id="ListElementLinks", tags=["entities"]))
class ElementLinks(CorpusACLMixin, ListAPIView):
    """
    List all links where parent and child are linked to the element.\n\n
    Requires a **guest** access to the element corpus
    """
    serializer_class = EntityLinkSerializer

    def get_queryset(self):
        try:
            element = Element.objects.select_related("corpus").only("id", "corpus").get(id=self.kwargs["pk"])
        except Element.DoesNotExist:
            raise NotFound
        if not self.has_read_access(element.corpus):
            raise PermissionDenied(detail="You do not have access to this element.")
        # Load entities linked by transcriptions
        entities_tr = Entity.objects.filter(transcriptions__element_id=element.id).prefetch_related("transcriptions")
        # Load entities linked by metadatas
        entities_meta = Entity.objects.filter(metadatas__element_id=element.id).prefetch_related("metadatas")
        # Now load all links belonging to those entities
        # It's several times faster to combine the queries in the final one
        # than combining them at the lower level (entities is slower than entities_tr + entities_meta)
        # We need to support cross references between transcriptions & metadata entities
        return EntityLink.objects.filter(
            Q(parent__in=entities_tr, child__in=entities_tr)
            | Q(parent__in=entities_tr, child__in=entities_meta)
            | Q(parent__in=entities_meta, child__in=entities_tr)
            | Q(parent__in=entities_meta, child__in=entities_meta)
        ).select_related("role", "child__type", "parent__type").order_by("parent__name")


@extend_schema_view(
    post=extend_schema(
        operation_id="CreateTranscriptionEntities",
......
import uuid
from textwrap import dedent
from django.db import transaction
from django.utils.functional import cached_property
@@ -300,16 +301,22 @@ class CorpusMLClassPagination(PageNumberPagination):
@extend_schema_view(
    get=extend_schema(
        operation_id="ListCorpusMLClasses",
        description=dedent("""
            List available classes in a corpus.
            Requires a **guest** access to the corpus.
        """),
    ),
    post=extend_schema(
        operation_id="CreateMLClass",
        description="Create an ML class in a corpus",
        description=dedent("""
            Create an ML class in a corpus.
            Requires an **admin** access to the corpus.
        """),
    )
)
class CorpusMLClassList(CorpusACLMixin, ListCreateAPIView):
    """
    List available classes in a corpus
    """
    serializer_class = MLClassSerializer
    pagination_class = CorpusMLClassPagination

    # For OpenAPI type discovery: a corpus ID is in the path
@@ -322,7 +329,7 @@ class CorpusMLClassList(CorpusACLMixin, ListCreateAPIView):
    def corpus(self):
        role = Role.Guest
        if self.request.method == "POST":
            role = Role.Contributor
            role = Role.Admin
        return self.get_corpus(self.kwargs["pk"], role=role)

    def check_permissions(self, *args, **kwargs):
@@ -357,10 +364,26 @@ class CorpusMLClassList(CorpusACLMixin, ListCreateAPIView):
@extend_schema(tags=["classifications"])
@extend_schema_view(
    get=extend_schema(description="Retrieve a ML class."),
    patch=extend_schema(description="Rename a ML class."),
    put=extend_schema(description="Rename a ML class."),
    delete=extend_schema(description="Delete a ML class if it is not used by any classification."),
    get=extend_schema(description=dedent("""
        Retrieve an ML class.
        Requires a **guest** access to the corpus.
    """)),
    patch=extend_schema(description=dedent("""
        Rename an ML class.
        Requires an **admin** access to the corpus.
    """)),
    put=extend_schema(description=dedent("""
        Rename an ML class.
        Requires an **admin** access to the corpus.
    """)),
    delete=extend_schema(description=dedent("""
        Delete an ML class if it is not used by any classification.
        Requires an **admin** access to the corpus.
    """)),
)
class MLClassRetrieve(CorpusACLMixin, RetrieveUpdateDestroyAPIView):
    serializer_class = MLClassSerializer
@@ -372,7 +395,7 @@ class MLClassRetrieve(CorpusACLMixin, RetrieveUpdateDestroyAPIView):
    def corpus(self):
        role = Role.Guest
        if self.request and self.request.method != "GET":
            role = Role.Contributor
            role = Role.Admin
        return self.get_corpus(self.kwargs["corpus"], role=role)
@@ -440,7 +463,7 @@ class ManageClassificationsSelection(SelectionMixin, CorpusACLMixin, CreateAPIVi
        mode = serializer.validated_data["mode"]
        if mode == ClassificationMode.Create:
            return self.create(corpus, request, *args, **kwargs)
        elif mode == ClassificationMode.Validate:
        if mode == ClassificationMode.Validate:
            elements = self.get_selection(corpus.id)
            Classification.objects.filter(
                element__in=elements,
......
@@ -6,7 +6,7 @@ from arkindex.documents.dates import DateType, InterpretedDate

logger = logging.getLogger(__name__)

# Months (unaccented, lowercase)
# Months, unaccented, lowercase
MONTHS = {
    "en": (
        "january",
@@ -143,7 +143,7 @@ def instanciate_date(date_elt):
    try:
        date.validate()
    except ValueError as e:
        logger.warning("Date fields are incorrect: {}".format(e))
        logger.warning(f"Date fields are incorrect: {e}")
        raise
    return date
@@ -161,6 +161,6 @@ def parse_date(raw_date, functions_table=DATE_FUNCTIONS_TABLE):
            if date_elts:
                return tuple(map(instanciate_date, date_elts))
        except Exception:
            logger.warning("Failed parsing {} with function {}".format(raw_date, f.__name__))
    logger.warning("Date not supported: {}".format(raw_date))
            logger.warning(f"Failed parsing {raw_date} with function {f.__name__}")
    logger.warning(f"Date not supported: {raw_date}")
    return ()
@@ -16,7 +16,7 @@ class DatePrecision(Enum):
    Day = "d"


class InterpretedDate(object):
class InterpretedDate:

    def __init__(self, year, month=None, day=None, type=DateType.Exact):
        self.year = int(year)
@@ -26,17 +26,17 @@ class InterpretedDate(object):
    def validate(self):
        if self.year < 0:
            raise ValueError("Year {} is negative".format(self.year))
            raise ValueError(f"Year {self.year} is negative")
        if self.month and (self.month < 1 or self.month > 12):
            raise ValueError("Month {} is not between 1 and 12".format(self.month))
            raise ValueError(f"Month {self.month} is not between 1 and 12")
        if self.day and (self.day < 1 or self.day > 31):
            raise ValueError("Day {} is not between 1 and 31".format(self.day))
            raise ValueError(f"Day {self.day} is not between 1 and 31")
        # Check if day is correct depending on year and month
        if self.precision == DatePrecision.Day:
            try:
                datetime(*tuple(self))
            except ValueError:
                raise ValueError("Date format is incorrect {}".format(self))
                raise ValueError(f"Date format is incorrect {self}")

    @property
    def precision(self):
@@ -45,7 +45,7 @@ class InterpretedDate(object):
        """
        if self.month and self.day:
            return DatePrecision.Day
        elif self.month:
        if self.month:
            return DatePrecision.Month
        return DatePrecision.Year
@@ -70,10 +70,10 @@ class InterpretedDate(object):
        return s > o

    def __str__(self):
        return "-".join("{:02d}".format(e) for e in tuple(self) if e)
        return "-".join(f"{e:02d}" for e in tuple(self) if e)


class InterpretedDateMixin(object):
class InterpretedDateMixin:
    """
    Adds on-demand date parsing from a text field to InterpretedDates.
    Requires a `raw_dates` property that returns the date string.
......
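Note: to make the InterpretedDate semantics above concrete, a short usage sketch based only on the behaviour visible in this diff (validate(), precision and the zero-padded __str__; the import path follows the one used elsewhere in this changeset):

    from arkindex.documents.dates import DatePrecision, InterpretedDate

    date = InterpretedDate(1845, month=1, day=3)
    date.validate()                           # passes: 1845-01-03 is a valid calendar date
    assert date.precision == DatePrecision.Day
    assert str(date) == "1845-01-03"          # __str__ zero-pads and joins with "-"

    InterpretedDate(1845, month=13).validate()
    # raises ValueError: Month 13 is not between 1 and 12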
@@ -38,8 +38,6 @@ EXPORT_QUERIES = [
    "entity_type",
    "entity",
    "transcription_entity",
    "entity_role",
    "entity_link",
    "metadata",
    "dataset",
    "dataset_element",
@@ -120,7 +118,7 @@ def send_email(subject, template_name, corpus_export, **context):
        logger.error(f"Failed to send email to {corpus_export.user.email}")


@job("high", timeout=settings.RQ_TIMEOUTS["export_corpus"])
@job("export", timeout=settings.RQ_TIMEOUTS["export_corpus"])
def export_corpus(corpus_export: CorpusExport) -> None:
    _, db_path = tempfile.mkstemp(suffix=".db")
    try:
......
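Note: this queue change pairs with the Makefile update above, which adds export to the queues consumed by the rqworker. Assuming @job here is the standard RQ/django-rq decorator, callers enqueue exactly as before; a minimal sketch:

    # Callers are unchanged; the task now lands on the dedicated "export"
    # queue instead of "high", so only workers listening on "export" pick it up.
    export_corpus.delay(corpus_export)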
SELECT link.id, link.parent_id, link.child_id, link.role_id
FROM documents_entitylink link
INNER JOIN documents_entityrole role ON (link.role_id = role.id)
WHERE role.corpus_id = '{corpus_id}'::uuid
SELECT id, parent_name, child_name, parent_type_id, child_type_id
FROM documents_entityrole
WHERE corpus_id = '{corpus_id}'::uuid
@@ -26,13 +26,6 @@ CREATE INDEX transcription_entity_entity_id ON transcription_entity (entity_id);
CREATE INDEX transcription_entity_worker_version_id ON transcription_entity (worker_version_id);
CREATE INDEX transcription_entity_worker_run_id ON transcription_entity (worker_run_id);
CREATE INDEX entity_link_parent_id ON entity_link (parent_id);
CREATE INDEX entity_link_child_id ON entity_link (child_id);
CREATE INDEX entity_link_role_id ON entity_link (role_id);
CREATE INDEX entity_role_parent_type_id ON entity_role (parent_type_id);
CREATE INDEX entity_role_child_type_id ON entity_role (child_type_id);
CREATE INDEX metadata_element_id ON metadata (element_id);
CREATE INDEX metadata_entity_id ON metadata (entity_id);
CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id);
......
PRAGMA foreign_keys = ON;

CREATE TABLE export_version AS SELECT 8 AS version;
CREATE TABLE export_version AS SELECT 9 AS version;

CREATE TABLE image_server (
    id INTEGER NOT NULL,
@@ -168,29 +168,6 @@ CREATE TABLE transcription_entity (
    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);

CREATE TABLE entity_role (
    id VARCHAR(37) NOT NULL,
    parent_name VARCHAR(250) NOT NULL,
    child_name VARCHAR(250) NOT NULL,
    parent_type_id VARCHAR(37) NOT NULL,
    child_type_id VARCHAR(37) NOT NULL,
    PRIMARY KEY (id),
    FOREIGN KEY (parent_type_id) REFERENCES entity_type (id) ON DELETE CASCADE,
    FOREIGN KEY (child_type_id) REFERENCES entity_type (id) ON DELETE CASCADE,
    UNIQUE (parent_name, child_name, parent_type_id, child_type_id)
);

CREATE TABLE entity_link (
    id VARCHAR(37) NOT NULL,
    parent_id VARCHAR(37) NOT NULL,
    child_id VARCHAR(37) NOT NULL,
    role_id VARCHAR(37) NOT NULL,
    PRIMARY KEY (id),
    FOREIGN KEY (parent_id) REFERENCES entity (id),
    FOREIGN KEY (child_id) REFERENCES entity (id),
    FOREIGN KEY (role_id) REFERENCES entity_role (id)
);

CREATE TABLE metadata (
    id VARCHAR(37) NOT NULL,
    element_id VARCHAR(37) NOT NULL,
......
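Note: for consumers of these exports, a minimal compatibility-check sketch against the bumped schema version, using only the export_version table defined above and mirroring the EXPORT_VERSION_MIN/EXPORT_VERSION_MAX range check added to the import command further down:

    import sqlite3

    EXPORT_VERSION_MIN, EXPORT_VERSION_MAX = 8, 9

    def check_export_version(db_path):
        # An Arkindex export stores a single row in the export_version table
        rows = sqlite3.connect(db_path).execute("SELECT version FROM export_version").fetchall()
        if len(rows) != 1 or not (EXPORT_VERSION_MIN <= rows[0][0] <= EXPORT_VERSION_MAX):
            raise ValueError(f"{db_path} does not have a supported export version")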
This diff is collapsed.
@@ -59,7 +59,7 @@ INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.i
"""


class Indexer(object):
class Indexer:

    # The query yielding all the elements to run on will look for all the child elements of all indexable elements
    # The joins can take a very long time, so the query gets split into one to fetch all the indexable elements,
@@ -6,12 +6,10 @@ from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import Q
from django.db.utils import IntegrityError
from rest_framework.authtoken.models import Token

from arkindex.images.models import ImageServer
from arkindex.ponos.models import Farm
from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType, WorkerVersion, WorkerVersionState
from arkindex.users.models import User

# Constants used in architecture project
UPLOADS_IMAGE_SERVER_ID = 12345
@@ -30,7 +28,6 @@ IMPORT_WORKER_SLUG = "file_import"
IMPORT_WORKER_REPO = "https://gitlab.teklia.com/arkindex/tasks"
IMPORT_WORKER_REVISION_MESSAGE = "File import worker bootstrap"
IMPORT_WORKER_REVISION_AUTHOR = "Dev Bootstrap"
ADMIN_API_TOKEN = "deadbeefTestToken"


class Command(BaseCommand):
@@ -48,15 +45,6 @@ class Command(BaseCommand):
        """Helper to display error messages"""
        self.stdout.write(self.style.ERROR(f"{msg}"))

    def check_user(self, user):
        """Ensure a user is admin"""
        if user.is_admin:
            self.success(f"Admin user for legacy worker API tokens {user} is valid")
        else:
            user.is_admin = True
            user.save()
            self.warn(f"Updated user {user} to admin")

    def create_image_server(self, id, url, bucket, region, display_name):
        try:
            server = ImageServer.objects.get(Q(id=id) | Q(url=url))
@@ -129,29 +117,6 @@ class Command(BaseCommand):
        )
        self.success("Ponos farm created")

        # An admin API user with a specific token
        try:
            token = Token.objects.get(key=ADMIN_API_TOKEN)
            self.check_user(token.user)
        except Token.DoesNotExist:
            # Create a new internal user
            user, _ = User.objects.get_or_create(
                email="internal+bootstrap@teklia.com",
                defaults={
                    "display_name": "Bootstrap Admin user",
                    "is_admin": True,
                }
            )
            self.success("Created internal user")
            self.check_user(user)

        # Finally create a specific token for that user
        if hasattr(user, "auth_token"):
            # Support One-To-One relation
            user.auth_token.delete()
        Token.objects.create(key=ADMIN_API_TOKEN, user=user)
        self.success(f"Created token {ADMIN_API_TOKEN}")

        # an image server for local cantaloupe https://ark.localhost/iiif/2
        uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID, UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET, UPLOADS_IMAGE_SERVER_REGION, "Local IIIF server for user uploaded files through frontend")
        if uploads_server is None:
......
@@ -49,7 +49,7 @@ class Command(BaseCommand):
        img5 = Image.objects.create(path="img5", width=1000, height=1000, server=imgsrv)
        img6 = Image.objects.create(path="img6", width=1000, height=1000, server=imgsrv)

        # Create an admin, an internal and a normal user
        # Create an admin and a normal user
        superuser = User.objects.create_superuser("root@root.fr", "Pa$$w0rd", display_name="Admin")
        superuser.verified_email = True
        superuser.save()
......
@@ -14,10 +14,21 @@ from rq.utils import as_text
from arkindex.documents.models import CorpusExport, CorpusExportState, Element
from arkindex.images.models import Image, ImageServer
from arkindex.ponos.models import Artifact, Task
from arkindex.process.models import DataFile, GitRef, GitRefType, Process, WorkerVersion, WorkerVersionState
from arkindex.process.models import (
    CorpusWorkerVersion,
    DataFile,
    GitRef,
    GitRefType,
    Process,
    Worker,
    WorkerActivity,
    WorkerRun,
    WorkerVersion,
    WorkerVersionState,
)
from arkindex.project.aws import s3
from arkindex.project.rq_overrides import Job
from arkindex.training.models import ModelVersion
from arkindex.training.models import Model, ModelVersion
from redis.exceptions import ConnectionError

# Ponos artifacts use the path: <task id>/<path>
@@ -32,6 +43,9 @@ class Command(BaseCommand):
    help = "Clean up old corpus exports, trashed DataFiles, expired processes and S3 buckets"

    def handle(self, *args, **options):
        # Cleaning up workers could free some artifacts, so clean them before artifacts
        self.cleanup_archived_workers()

        self.cleanup_artifacts()

        self.cleanup_expired_processes()
@@ -48,6 +62,8 @@ class Command(BaseCommand):
        self.cleanup_ponos_logs()

        self.cleanup_archived_models()

        self.cleanup_unlinked_model_versions()

        self.cleanup_rq_user_registries()
@@ -294,6 +310,71 @@ class Command(BaseCommand):
        self.stdout.write(self.style.SUCCESS("Successfully cleaned up orphaned Ponos logs."))

    def cleanup_archived_workers(self):
        """
        Remove Worker instances that have been archived for longer than the configured worker cleanup delay
        and that are not being used in any worker result.
        """
        self.stdout.write("Removing archived workers…")
        workers = Worker.objects.filter(archived__lte=timezone.now() - timedelta(days=settings.WORKER_CLEANUP_DELAY))

        skipped, deleted = 0, 0
        for worker in workers.iterator():
            # There are both foreign keys for worker versions and worker runs on worker results.
            # Some old results might only have a worker version ID, but when a worker run ID is set,
            # the worker version ID is deduced from it, so we only have to check on the version.
            if worker.versions.all().in_use():
                skipped += 1
                continue

            # Skip any workers whose WorkerConfigurations are in use.
            # This should never happen since we already filter on the WorkerVersions,
            # but that could lead to deleting worker results when we didn't want to.
            if WorkerRun.objects.filter(configuration__worker=worker).in_use():
                self.stdout.write(self.style.WARNING(
                    f"Worker {worker.name} ({worker.id}) does not have any worker versions used by worker results, "
                    "but some of its worker configurations are in use."
                ))
                continue

            self.stdout.write(f"Removing worker {worker.name} ({worker.id})")
            worker.delete()
            deleted += 1

        if skipped:
            self.stdout.write(f"Skipping {skipped} archived workers that have worker versions or configurations used in worker results.")
        self.stdout.write(self.style.SUCCESS(f"Successfully cleaned up {deleted} archived workers."))

    def cleanup_archived_models(self):
        """
        Remove Model instances that have been archived for longer than the configured model cleanup delay
        and that are not being used in any worker result.
        """
        self.stdout.write("Removing archived models…")
        models = Model.objects.filter(archived__lte=timezone.now() - timedelta(days=settings.MODEL_CLEANUP_DELAY))

        skipped, deleted = 0, 0
        for model in models.iterator():
            if WorkerRun.objects.filter(model_version__model=model).in_use():
                skipped += 1
                continue

            self.stdout.write(f"Removing model {model.name} ({model.id})")
            # Remove CorpusWorkerVersions and WorkerActivities first
            # Those normally use SET_NULL, but this can cause the unique constraints to complain
            # if there already are rows with a model version set to None.
            WorkerActivity.objects.filter(model_version__model=model).delete()
            CorpusWorkerVersion.objects.filter(model_version__model=model).delete()
            model.delete()
            deleted += 1

        if skipped:
            self.stdout.write(f"Skipping {skipped} archived models that have model versions used in worker results.")
        self.stdout.write(self.style.SUCCESS(f"Successfully cleaned up {deleted} archived models."))

    def cleanup_unlinked_model_versions(self):
        self.stdout.write("Removing orphaned model versions archives…")
        bucket = s3.Bucket(settings.AWS_TRAINING_BUCKET)
......
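Note: both cleanup loops rely on an in_use() queryset helper that is not part of this diff. A rough, hypothetical sketch of the idea (the WorkerVersionQuerySet name and the Element reverse relation below are illustrative only, not taken from this changeset):

    from django.db import models

    from arkindex.documents.models import Element


    class WorkerVersionQuerySet(models.QuerySet):
        def in_use(self):
            # "In use" here means at least one worker result still references
            # one of these versions; Element is one example of such a result.
            return Element.objects.filter(worker_version__in=self).exists()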
@@ -18,8 +18,6 @@ from arkindex.documents.models import (
    ElementPath,
    ElementType,
    Entity,
    EntityLink,
    EntityRole,
    EntityType,
    MetaData,
    MLClass,
@@ -40,7 +38,8 @@ from arkindex.process.models import (
from arkindex.training.models import Dataset, DatasetElement, DatasetSet, Model
from arkindex.users.models import Role, User

EXPORT_VERSION = 8
EXPORT_VERSION_MIN = 8
EXPORT_VERSION_MAX = 9

TABLE_NAMES = {
    "export_version",
@@ -52,8 +51,6 @@ TABLE_NAMES = {
    "element_path",
    "entity",
    "entity_type",
    "entity_role",
    "entity_link",
    "transcription",
    "transcription_entity",
    "metadata",
@@ -132,8 +129,6 @@ SQL_TOP_LEVEL_PATH_QUERY = """

SQL_ENTITY_QUERY = "SELECT * FROM entity"
SQL_ENTITY_TYPE_QUERY = "SELECT * FROM entity_type"
SQL_ENTITY_ROLE_QUERY = "SELECT * FROM entity_role"
SQL_ENTITY_LINK_QUERY = "SELECT * FROM entity_link"

SQL_TRANSCRIPTION_QUERY = "SELECT * FROM transcription"
SQL_TRANSCRIPTION_ENTITY_QUERY = "SELECT * FROM transcription_entity"
@@ -249,24 +244,6 @@ class Command(BaseCommand):
            corpus=self.corpus
        )]

    def convert_entity_roles(self, row):
        return [EntityRole(
            id=row["id"],
            parent_name=row["parent_name"],
            child_name=row["child_name"],
            parent_type_id=row["parent_type_id"],
            child_type_id=row["child_type_id"],
            corpus=self.corpus
        )]

    def convert_entity_links(self, row):
        return [EntityLink(
            id=row["id"],
            parent_id=row["parent_id"],
            child_id=row["child_id"],
            role_id=row["role_id"],
        )]

    def convert_transcriptions(self, row):
        return [Transcription(
            id=row["id"],
@@ -543,13 +520,16 @@ class Command(BaseCommand):
        # Check database tables
        db_results = self.db.execute(SQL_TABLES_QUERY).fetchall()
        if not set([table["name"] for table in db_results]) == TABLE_NAMES:
            raise CommandError(f"The SQLite database {db_path} is not a correct Arkindex export")
        # Database's tables must be a superset of TABLE_NAMES, so we keep compatibility when removing things
        if (missing := TABLE_NAMES - set([table["name"] for table in db_results])):
            raise CommandError(f"The SQLite database {db_path} is missing some expected tables: {sorted(missing)}")

        # Check export version
        db_results = self.db.execute(SQL_VERSION_QUERY).fetchall()
        if len(db_results) != 1 or db_results[0]["version"] != EXPORT_VERSION:
            raise CommandError(f"The SQLite database {db_path} does not have the correct export version")
        if len(db_results) != 1 or not (
            EXPORT_VERSION_MIN <= db_results[0]["version"] <= EXPORT_VERSION_MAX
        ):
            raise CommandError(f"The SQLite database {db_path} does not have a supported export version")

        # Retrieve corpus name
        date = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")
@@ -597,11 +577,9 @@ class Command(BaseCommand):
        self.bulk_create_objects(ElementPath, self.convert_element_paths, SQL_ELEMENT_PATH_QUERY, ignore_conflicts=False)
        self.bulk_create_objects(ElementPath, self.convert_top_level_paths, SQL_TOP_LEVEL_PATH_QUERY, ignore_conflicts=False)

        # Create entities, entity types, roles and links
        # Create entities and entity types
        self.bulk_create_objects(EntityType, self.convert_entity_types, SQL_ENTITY_TYPE_QUERY)
        self.bulk_create_objects(Entity, self.convert_entities, SQL_ENTITY_QUERY)
        self.bulk_create_objects(EntityRole, self.convert_entity_roles, SQL_ENTITY_ROLE_QUERY)
        self.bulk_create_objects(EntityLink, self.convert_entity_links, SQL_ENTITY_LINK_QUERY)

        # Create transcriptions and transcription entities
        self.bulk_create_objects(Transcription, self.convert_transcriptions, SQL_TRANSCRIPTION_QUERY)
......
@@ -135,24 +135,6 @@ class Command(BaseCommand):
            """)
            self.stdout.write(f"Updated {cursor.rowcount} TranscriptionEntities.")

            self.stdout.write("Updating child entity IDs on entity links…")
            cursor.execute("""
                UPDATE documents_entitylink
                SET child_id = keep_id
                FROM duplicated_entities
                WHERE child_id = remove_id;
            """)
            self.stdout.write(f"Updated {cursor.rowcount} entity links.")

            self.stdout.write("Updating parent entity IDs on entity links…")
            cursor.execute("""
                UPDATE documents_entitylink
                SET parent_id = keep_id
                FROM duplicated_entities
                WHERE parent_id = remove_id;
            """)
            self.stdout.write(f"Updated {cursor.rowcount} entity links.")

            self.stdout.write("Removing duplicate entities…")
            cursor.execute("""
                DELETE FROM documents_entity