From a1930eeeb28bc04f827deef0561bc26d3056d614 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Thu, 29 Oct 2020 13:43:21 +0000 Subject: [PATCH] Optimize element deletion using raw SQL --- arkindex/documents/api/elements.py | 8 +- arkindex/documents/deletion.py | 94 +++++++++++++++++++ .../documents/management/commands/delete.py | 9 +- arkindex/documents/models.py | 18 ++-- arkindex/documents/signals.py | 65 +------------ .../tests/consumers/test_corpus_consumer.py | 2 +- arkindex/documents/tests/test_corpus.py | 20 ---- .../documents/tests/test_destroy_elements.py | 23 ++++- .../documents/tests/test_retrieve_elements.py | 13 +-- 9 files changed, 139 insertions(+), 113 deletions(-) create mode 100644 arkindex/documents/deletion.py diff --git a/arkindex/documents/api/elements.py b/arkindex/documents/api/elements.py index 1d4993c36b..3dd734d9a5 100644 --- a/arkindex/documents/api/elements.py +++ b/arkindex/documents/api/elements.py @@ -513,8 +513,12 @@ class ElementRetrieve(RetrieveUpdateDestroyAPIView): def get_queryset(self): corpora = Corpus.objects.readable(self.request.user) - return Element.objects \ - .filter(corpus__in=corpora) \ + queryset = Element.objects.filter(corpus__in=corpora) + if self.request and self.request.method == 'DELETE': + # Only include corpus for ACL check and ID for deletion + return queryset.select_related('corpus').only('id', 'corpus') + + return queryset \ .select_related( 'corpus', 'type', diff --git a/arkindex/documents/deletion.py b/arkindex/documents/deletion.py new file mode 100644 index 0000000000..34fd83a0f8 --- /dev/null +++ b/arkindex/documents/deletion.py @@ -0,0 +1,94 @@ +import logging +from uuid import UUID + +from django.db import connections + +logger = logging.getLogger(__name__) + + +def delete_element(element_id: UUID) -> None: + assert isinstance(element_id, UUID) + with connections['default'].cursor() as cursor: + # Remove transcription-entity links + cursor.execute(""" + DELETE FROM documents_transcriptionentity te WHERE transcription_id IN ( + SELECT t.id FROM documents_transcription t + LEFT JOIN documents_elementpath elementpath USING (element_id) + WHERE t.element_id = %(id)s OR elementpath.path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} transcription-entity links") + + # Remove transcriptions + cursor.execute(""" + DELETE FROM documents_transcription + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} transcriptions") + + # Remove classifications + cursor.execute(""" + DELETE FROM documents_classification + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} classifications") + + # Remove element metadata + cursor.execute(""" + DELETE FROM documents_metadata + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} metadata") + + # Remove element-dataimport links + cursor.execute(""" + DELETE FROM dataimport_dataimportelement + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} usage from dataimport as selection") + + # Set elements directly on dataimports to None + cursor.execute(""" + UPDATE dataimport_dataimport + SET element_id = NULL + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} usage from dataimport as element") + + # Remove user selections + cursor.execute(""" + DELETE FROM documents_selection selection + WHERE element_id = %(id)s + OR element_id IN ( + SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s] + ) + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} user selections") + + # Remove element paths and elements simultaneously + cursor.execute(""" + WITH children_ids (id) AS ( + DELETE FROM documents_elementpath + WHERE element_id = %(id)s OR path && ARRAY[%(id)s] + RETURNING element_id + ) + DELETE FROM documents_element element + USING children_ids + WHERE element.id = children_ids.id + """, {'id': element_id}) + logger.info(f"Deleted {cursor.rowcount} child elements with paths") diff --git a/arkindex/documents/management/commands/delete.py b/arkindex/documents/management/commands/delete.py index 44c22f0d6b..f57cfa34d0 100644 --- a/arkindex/documents/management/commands/delete.py +++ b/arkindex/documents/management/commands/delete.py @@ -5,12 +5,13 @@ import uuid from django.core.management.base import BaseCommand from arkindex.documents.models import Element +from arkindex_common.tools import Timer +# Enable deletion signal logs logging.basicConfig( level=logging.INFO, format='[%(levelname)s] %(message)s', ) -logger = logging.getLogger(__name__) def valid_uuid(value): @@ -34,5 +35,7 @@ class Command(BaseCommand): def handle(self, element_id, verbosity=0, **options): for element in Element.objects.filter(id__in=element_id): - logger.info(f"Deleting {element.id} : {element}") - element.delete() + self.stdout.write(f"Deleting {element.id} : {element}") + with Timer() as t: + element.delete() + self.stdout.write(f'Deleted {element} in {t.delta}') diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 2e454de3da..c687b1549e 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -11,6 +11,7 @@ from django.utils.functional import cached_property from enumfields import Enum, EnumField from arkindex.documents.dates import InterpretedDateMixin +from arkindex.documents.deletion import delete_element from arkindex.documents.managers import CorpusManager, ElementManager from arkindex.project.default_corpus import DEFAULT_CORPUS_TYPES, DEFAULT_TRANSKRIBUS_TYPES from arkindex.project.elastic import ESElement, ESEntity, ESTranscription @@ -55,15 +56,6 @@ class Corpus(IndexableModel): def __str__(self): return self.name - def delete(self): - """ - Override the normal deletion process to delete elements before deleting the corpus: - prevents a ProtectedError when element types are being deleted before elements. - """ - with transaction.atomic(): - self.elements.all().delete() - super().delete() - def get_acl_rights(self, user): ''' List Access rights on this corpus for this user @@ -174,6 +166,14 @@ class Element(IndexableModel): es_document = ESElement objects = ElementManager() + def delete(self, *args, **kwargs): + """ + Use raw SQL for element deletion and bypass Django's cascading + """ + assert self.id, 'Cannot delete an element without an ID' + delete_element(self.id) + Element.objects.filter(id=self.id)._raw_delete(using='default') + @transaction.atomic def add_parent(self, parent, skip_children=False): ''' diff --git a/arkindex/documents/signals.py b/arkindex/documents/signals.py index 446c6f9c13..fb937a661e 100644 --- a/arkindex/documents/signals.py +++ b/arkindex/documents/signals.py @@ -1,68 +1,9 @@ -import logging - -from django.db.models import Q from django.db.models.signals import pre_delete from django.dispatch import receiver -from arkindex.dataimport.models import DataImport, DataImportElement -from arkindex.documents.models import ( - Classification, - Element, - ElementPath, - MetaData, - Selection, - Transcription, - TranscriptionEntity, -) - -logger = logging.getLogger(__name__) +from arkindex.documents.models import Element @receiver(pre_delete, sender=Element) -def pre_delete_handler(sender, instance, using, *args, **kwargs): - assert isinstance(instance, Element) - - # Build queryset to lookup all children, with or without top element - children = Element.objects.get_descending(instance.id).values('id') - elements = Q(element_id__in=children) | Q(element_id=instance.id) - - # Remove all transcription entity links - deleted, _ = TranscriptionEntity.objects.filter(Q(transcription__element_id__in=children) | Q(transcription__element_id=instance.id)).delete() - logger.info(f"Deleted {deleted} transcriptions entities") - - # Remove all transcriptions - deleted, _ = Transcription.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} transcriptions") - - # Remove all classifications - deleted, _ = Classification.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} classifications") - - # Remove all metadatas - deleted, _ = MetaData.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} metadatas") - - # Remove all selections from users - deleted, _ = Selection.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} user selections") - - # Remove all usage from a dataimport from users selection - deleted, _ = DataImportElement.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} usage from dataimport as selection") - - # Remove all direct usage on a dataimport, without removing the dataimport - updated = DataImport.objects.filter(elements).update(element_id=None) - logger.info(f"Deleted {updated} usage from dataimport as element") - - # Save children IDs reference before deleting the paths - # or the final element deletion query will never get anything to delete - children_ids = list(children.values_list('id', flat=True)) - - # Remove all paths for these elements - deleted, _ = ElementPath.objects.filter(elements).delete() - logger.info(f"Deleted {deleted} paths") - - # Finally Remove all children, calling this pre_delete signal recursively - # The instance element will be deleted by the parent delete() call - deleted = Element.objects.filter(id__in=children_ids)._raw_delete(using=using) - logger.info(f"Deleted {deleted} children") +def pre_delete_handler(sender, instance, *args, **kwargs): + raise NotImplementedError('Deleting elements from a QuerySet is not supported. Please call Element.delete on each element.') diff --git a/arkindex/documents/tests/consumers/test_corpus_consumer.py b/arkindex/documents/tests/consumers/test_corpus_consumer.py index af38dcf620..619fa828a4 100644 --- a/arkindex/documents/tests/consumers/test_corpus_consumer.py +++ b/arkindex/documents/tests/consumers/test_corpus_consumer.py @@ -84,7 +84,7 @@ class TestDeleteCorpus(FixtureTestCase): self.corpus.repository = self.repo self.corpus.save() - with self.assertNumQueries(50): + with self.assertNumQueries(47): CorpusConsumer({}).corpus_delete({'id': str(self.corpus.id)}) # Ensure the command restores the signal receivers diff --git a/arkindex/documents/tests/test_corpus.py b/arkindex/documents/tests/test_corpus.py index b816b02bd7..27ea5fa446 100644 --- a/arkindex/documents/tests/test_corpus.py +++ b/arkindex/documents/tests/test_corpus.py @@ -3,7 +3,6 @@ from unittest.mock import patch import mock from django.contrib.auth.models import AnonymousUser -from django.db.models import Model, ProtectedError from django.urls import reverse from mock import AsyncMock from rest_framework import status @@ -401,22 +400,3 @@ class TestCorpus(FixtureAPITestCase): 'batch_size': 1000, } ) - - def test_delete_elements_first(self): - """ - Ensure corpus deletion deletes elements first to prevent a ProtectedError - """ - self.assertEqual(Corpus.objects.count(), 3) - self.assertEqual(Element.objects.count(), 29) - - # Use Django's default deletion method and expect a ProtectedError - with self.assertRaises(ProtectedError): - Model.delete(self.corpus) - - self.assertEqual(Corpus.objects.count(), 3) - self.assertEqual(Element.objects.count(), 29) - - # Use the corpus deletion method and it works! - self.corpus.delete() - self.assertEqual(Corpus.objects.count(), 2) - self.assertEqual(Element.objects.count(), 0) diff --git a/arkindex/documents/tests/test_destroy_elements.py b/arkindex/documents/tests/test_destroy_elements.py index a4e43612a3..982bd03888 100644 --- a/arkindex/documents/tests/test_destroy_elements.py +++ b/arkindex/documents/tests/test_destroy_elements.py @@ -1,7 +1,7 @@ from django.urls import reverse from rest_framework import status -from arkindex.documents.models import Corpus +from arkindex.documents.models import Corpus, Element from arkindex.project.tests import FixtureAPITestCase @@ -15,7 +15,8 @@ class TestDestroyElements(FixtureAPITestCase): cls.private_corpus = Corpus.objects.create(name='private', public=False) def test_element_destroy_verified_user(self): - response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)})) + with self.assertNumQueries(0): + response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)})) self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) def test_element_destroy_acl(self): @@ -25,7 +26,8 @@ class TestDestroyElements(FixtureAPITestCase): type=self.volume_type, name='Castle story' ) - response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)})) + with self.assertNumQueries(4): + response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)})) self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) self.assertDictEqual( response.json(), @@ -39,6 +41,19 @@ class TestDestroyElements(FixtureAPITestCase): name='Castle story' ) self.assertEqual(self.corpus.elements.filter(id=castle_story.id).exists(), True) - response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)})) + with self.assertNumQueries(13): + response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)})) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) self.assertEqual(self.corpus.elements.filter(id=castle_story.id).exists(), False) + + def test_non_empty_element(self): + """ + We can now delete a non-empty element + """ + self.client.force_login(self.user) + with self.assertNumQueries(13): + response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)})) + self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) + + with self.assertRaises(Element.DoesNotExist): + self.vol.refresh_from_db() diff --git a/arkindex/documents/tests/test_retrieve_elements.py b/arkindex/documents/tests/test_retrieve_elements.py index aae92d1a99..b9e593af2f 100644 --- a/arkindex/documents/tests/test_retrieve_elements.py +++ b/arkindex/documents/tests/test_retrieve_elements.py @@ -1,7 +1,7 @@ from django.urls import reverse from rest_framework import status -from arkindex.documents.models import Classification, Corpus, DataSource, Element, Entity, MLClass +from arkindex.documents.models import Classification, Corpus, DataSource, Entity, MLClass from arkindex.project.tests import FixtureAPITestCase from arkindex_common.enums import EntityType, MetaType from arkindex_common.ml_tool import MLToolType @@ -127,17 +127,6 @@ class TestRetrieveElements(FixtureAPITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIsNone(response.json()['thumbnail_put_url']) - def test_non_empty_folder(self): - """ - We can now delete a non-empty folder - """ - self.client.force_login(self.user) - response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - - with self.assertRaises(Element.DoesNotExist): - self.vol.refresh_from_db() - def test_list_element_metadata(self): with self.assertNumQueries(6): response = self.client.get(reverse('api:element-retrieve', kwargs={'pk': str(self.page.id)})) -- GitLab