From 853f8007836134ec015dfd70f647b9803151ab1c Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Thu, 4 Jul 2024 15:06:16 +0200 Subject: [PATCH] Delete entities in DestroyWorkerResults on corpora --- arkindex/documents/tasks.py | 21 +++++++++++++ .../tests/tasks/test_worker_results_delete.py | 2 ++ .../worker_results_delete_all_versions.sql | 24 ++++++++++++++- ...er_results_delete_configuration_filter.sql | 30 +++++++++++++++++++ .../worker_results_delete_in_corpus.sql | 21 +++++++++++++ ...er_results_delete_in_corpus_worker_run.sql | 23 +++++++++++++- 6 files changed, 119 insertions(+), 2 deletions(-) diff --git a/arkindex/documents/tasks.py b/arkindex/documents/tasks.py index ce4c680ae9..80ece7ecc1 100644 --- a/arkindex/documents/tasks.py +++ b/arkindex/documents/tasks.py @@ -16,6 +16,7 @@ from arkindex.documents.models import ( Corpus, Element, ElementPath, + Entity, MetaData, Selection, Transcription, @@ -176,6 +177,7 @@ def worker_results_delete( Q(transcription__element__corpus_id=corpus_id) | Q(entity__corpus_id=corpus_id) ) + entities = Entity.objects.filter(corpus_id=corpus_id) metadata = MetaData.objects.filter(element__corpus_id=corpus_id) worker_activities = WorkerActivity.objects.filter(element__corpus_id=corpus_id) @@ -186,6 +188,7 @@ def worker_results_delete( transcriptions = transcriptions.filter(worker_run_id=worker_run_id) transcription_entities = transcription_entities.filter(transcription__worker_run_id=worker_run_id) worker_transcription_entities = worker_transcription_entities.filter(worker_run_id=worker_run_id) + entities = entities.filter(worker_run_id=worker_run_id) metadata = metadata.filter(worker_run_id=worker_run_id) # There is no worker_run_id on Worker Activities so the best thing we can do is delete the worker activities # attached to the elements produced with that worker run, and they are already being deleted by elements.trash() @@ -197,6 +200,7 @@ def worker_results_delete( transcriptions = transcriptions.filter(worker_version_id=version_id) transcription_entities = transcription_entities.filter(transcription__worker_version_id=version_id) worker_transcription_entities = worker_transcription_entities.filter(worker_version_id=version_id) + entities = entities.filter(worker_version_id=version_id) metadata = metadata.filter(worker_version_id=version_id) worker_activities = worker_activities.filter(worker_version_id=version_id) # Otherwise, select everything that has any worker version ID. (When something has been created @@ -211,9 +215,14 @@ def worker_results_delete( transcriptions = transcriptions.exclude(worker_version_id=None) transcription_entities = transcription_entities.exclude(transcription__worker_version_id=None) worker_transcription_entities = worker_transcription_entities.exclude(worker_version_id=None) + entities = entities.exclude(worker_version_id=None) metadata = metadata.exclude(worker_version_id=None) if element_id: + # When running with a parent element, we do not delete any entities, + # as checking whether they are used in other elements is too costly. + entities = entities.none() + # include_children causes a deletion *only* on the element's descendants. # To also delete on the element itself, we will call this task synchronously with the same arguments and include_children=False. # This is used to avoid filtering by Q(id=element_id) | Q(paths__path__overlap=[element_id]) all at once, @@ -245,6 +254,7 @@ def worker_results_delete( transcriptions = transcriptions.filter(worker_run__model_version_id=model_version_id) transcription_entities = transcription_entities.filter(transcription__worker_run__model_version_id=model_version_id) worker_transcription_entities = worker_transcription_entities.filter(worker_run__model_version_id=model_version_id) + entities = entities.filter(worker_run__model_version_id=model_version_id) metadata = metadata.filter(worker_run__model_version_id=model_version_id) worker_activities = worker_activities.filter(model_version_id=model_version_id) @@ -256,6 +266,7 @@ def worker_results_delete( transcriptions = transcriptions.filter(worker_run__configuration_id=None) transcription_entities = transcription_entities.filter(transcription__worker_run__configuration_id=None) worker_transcription_entities = worker_transcription_entities.filter(worker_run__configuration_id=None) + entities = entities.filter(worker_run__configuration_id=None) metadata = metadata.filter(worker_run__configuration_id=None) worker_activities = worker_activities.filter(configuration_id=None) else: @@ -264,16 +275,26 @@ def worker_results_delete( transcriptions = transcriptions.filter(worker_run__configuration_id=configuration_id) transcription_entities = transcription_entities.filter(transcription__worker_run__configuration_id=configuration_id) worker_transcription_entities = worker_transcription_entities.filter(worker_run__configuration_id=configuration_id) + entities = entities.filter(worker_run__configuration_id=configuration_id) metadata = metadata.filter(worker_run__configuration_id=configuration_id) worker_activities = worker_activities.filter(configuration_id=configuration_id) elements.trash() classifications.delete() + # Delete TranscriptionEntities before transcriptions so that we can delete transcriptions using a single DELETE query transcription_entities.delete() worker_transcription_entities.delete() transcriptions._raw_delete(using="default") + metadata.delete() + + # Remove entities from metadata and remove their associated transcription entities first + # so that entities can be deleted in one query. + MetaData.objects.filter(entity__in=entities).update(entity_id=None) + TranscriptionEntity.objects.filter(entity__in=entities).delete() + entities._raw_delete(using="default") + worker_activities.delete() if element_id and include_children: diff --git a/arkindex/documents/tests/tasks/test_worker_results_delete.py b/arkindex/documents/tests/tasks/test_worker_results_delete.py index 52546e1bac..2906bc25b4 100644 --- a/arkindex/documents/tests/tasks/test_worker_results_delete.py +++ b/arkindex/documents/tests/tasks/test_worker_results_delete.py @@ -211,6 +211,7 @@ class TestDeleteWorkerResults(FixtureTestCase): self.transcription2, self.transcription_entity1, self.transcription_entity2, + self.entity, ) def test_run_no_configuration_filter(self): @@ -261,6 +262,7 @@ class TestDeleteWorkerResults(FixtureTestCase): self.transcription_entity1, self.transcription_entity2, self.page2, + self.entity, ) def test_run_dataset_failure(self): diff --git a/arkindex/sql_validation/worker_results_delete_all_versions.sql b/arkindex/sql_validation/worker_results_delete_all_versions.sql index afbb1ee13b..dc5367c8ac 100644 --- a/arkindex/sql_validation/worker_results_delete_all_versions.sql +++ b/arkindex/sql_validation/worker_results_delete_all_versions.sql @@ -268,9 +268,31 @@ WHERE "documents_metadata"."id" IN WHERE (U1."corpus_id" = '{corpus_id}'::uuid AND NOT (U0."worker_version_id" IS NULL))); +UPDATE "documents_metadata" +SET "entity_id" = NULL +WHERE "documents_metadata"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND NOT (U0."worker_version_id" IS NULL))); + +DELETE +FROM "documents_transcriptionentity" +WHERE "documents_transcriptionentity"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND NOT (U0."worker_version_id" IS NULL))); + +DELETE +FROM "documents_entity" +WHERE ("documents_entity"."corpus_id" = '{corpus_id}'::uuid + AND NOT ("documents_entity"."worker_version_id" IS NULL)); + DELETE FROM "process_workeractivity" -WHERE "process_workeractivity"."id" IN (SELECT U0."id" +WHERE "process_workeractivity"."id" IN + (SELECT U0."id" FROM "process_workeractivity" U0 INNER JOIN "documents_element" U1 ON (U0."element_id" = U1."id") WHERE U1."corpus_id" = '{corpus_id}'::uuid) diff --git a/arkindex/sql_validation/worker_results_delete_configuration_filter.sql b/arkindex/sql_validation/worker_results_delete_configuration_filter.sql index f1e2ed0627..9ee04a8506 100644 --- a/arkindex/sql_validation/worker_results_delete_configuration_filter.sql +++ b/arkindex/sql_validation/worker_results_delete_configuration_filter.sql @@ -175,6 +175,36 @@ WHERE "documents_metadata"."id" IN AND NOT (U0."worker_version_id" IS NULL) AND U4."configuration_id" = '{configuration_id}'::uuid)); +UPDATE "documents_metadata" +SET "entity_id" = NULL +WHERE "documents_metadata"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + INNER JOIN "process_workerrun" U3 ON (U0."worker_run_id" = U3."id") + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND NOT (U0."worker_version_id" IS NULL) + AND U3."configuration_id" = '{configuration_id}'::uuid)); + +DELETE +FROM "documents_transcriptionentity" +WHERE "documents_transcriptionentity"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + INNER JOIN "process_workerrun" U3 ON (U0."worker_run_id" = U3."id") + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND NOT (U0."worker_version_id" IS NULL) + AND U3."configuration_id" = '{configuration_id}'::uuid)); + +DELETE +FROM "documents_entity" +WHERE "documents_entity"."id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + INNER JOIN "process_workerrun" U3 ON (U0."worker_run_id" = U3."id") + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND NOT (U0."worker_version_id" IS NULL) + AND U3."configuration_id" = '{configuration_id}'::uuid)); + DELETE FROM "process_workeractivity" WHERE "process_workeractivity"."id" IN diff --git a/arkindex/sql_validation/worker_results_delete_in_corpus.sql b/arkindex/sql_validation/worker_results_delete_in_corpus.sql index 3f7e788120..e9f17f739d 100644 --- a/arkindex/sql_validation/worker_results_delete_in_corpus.sql +++ b/arkindex/sql_validation/worker_results_delete_in_corpus.sql @@ -268,6 +268,27 @@ WHERE "documents_metadata"."id" IN WHERE (U1."corpus_id" = '{corpus_id}'::uuid AND U0."worker_version_id" = '{version_id}'::uuid)); +UPDATE "documents_metadata" +SET "entity_id" = NULL +WHERE "documents_metadata"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND U0."worker_version_id" = '{version_id}'::uuid)); + +DELETE +FROM "documents_transcriptionentity" +WHERE "documents_transcriptionentity"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND U0."worker_version_id" = '{version_id}'::uuid)); + +DELETE +FROM "documents_entity" +WHERE ("documents_entity"."corpus_id" = '{corpus_id}'::uuid + AND "documents_entity"."worker_version_id" = '{version_id}'::uuid); + DELETE FROM "process_workeractivity" WHERE "process_workeractivity"."id" IN diff --git a/arkindex/sql_validation/worker_results_delete_in_corpus_worker_run.sql b/arkindex/sql_validation/worker_results_delete_in_corpus_worker_run.sql index e01a89ead2..dcb0166086 100644 --- a/arkindex/sql_validation/worker_results_delete_in_corpus_worker_run.sql +++ b/arkindex/sql_validation/worker_results_delete_in_corpus_worker_run.sql @@ -141,4 +141,25 @@ WHERE "documents_metadata"."id" IN FROM "documents_metadata" U0 INNER JOIN "documents_element" U1 ON (U0."element_id" = U1."id") WHERE (U1."corpus_id" = '{corpus_id}'::uuid - AND U0."worker_run_id" = '{worker_run_id}'::uuid)) + AND U0."worker_run_id" = '{worker_run_id}'::uuid)); + +UPDATE "documents_metadata" +SET "entity_id" = NULL +WHERE "documents_metadata"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND U0."worker_run_id" = '{worker_run_id}'::uuid)); + +DELETE +FROM "documents_transcriptionentity" +WHERE "documents_transcriptionentity"."entity_id" IN + (SELECT U0."id" + FROM "documents_entity" U0 + WHERE (U0."corpus_id" = '{corpus_id}'::uuid + AND U0."worker_run_id" = '{worker_run_id}'::uuid)); + +DELETE +FROM "documents_entity" +WHERE ("documents_entity"."corpus_id" = '{corpus_id}'::uuid + AND "documents_entity"."worker_run_id" = '{worker_run_id}'::uuid) -- GitLab