From a1930eeeb28bc04f827deef0561bc26d3056d614 Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Thu, 29 Oct 2020 13:43:21 +0000
Subject: [PATCH] Optimize element deletion using raw SQL

---
 arkindex/documents/api/elements.py            |  8 +-
 arkindex/documents/deletion.py                | 94 +++++++++++++++++++
 .../documents/management/commands/delete.py   |  9 +-
 arkindex/documents/models.py                  | 18 ++--
 arkindex/documents/signals.py                 | 65 +------------
 .../tests/consumers/test_corpus_consumer.py   |  2 +-
 arkindex/documents/tests/test_corpus.py       | 20 ----
 .../documents/tests/test_destroy_elements.py  | 23 ++++-
 .../documents/tests/test_retrieve_elements.py | 13 +--
 9 files changed, 139 insertions(+), 113 deletions(-)
 create mode 100644 arkindex/documents/deletion.py

diff --git a/arkindex/documents/api/elements.py b/arkindex/documents/api/elements.py
index 1d4993c36b..3dd734d9a5 100644
--- a/arkindex/documents/api/elements.py
+++ b/arkindex/documents/api/elements.py
@@ -513,8 +513,12 @@ class ElementRetrieve(RetrieveUpdateDestroyAPIView):
 
     def get_queryset(self):
         corpora = Corpus.objects.readable(self.request.user)
-        return Element.objects \
-            .filter(corpus__in=corpora) \
+        queryset = Element.objects.filter(corpus__in=corpora)
+        if self.request and self.request.method == 'DELETE':
+            # Only include corpus for ACL check and ID for deletion
+            return queryset.select_related('corpus').only('id', 'corpus')
+
+        return queryset \
             .select_related(
                 'corpus',
                 'type',
diff --git a/arkindex/documents/deletion.py b/arkindex/documents/deletion.py
new file mode 100644
index 0000000000..34fd83a0f8
--- /dev/null
+++ b/arkindex/documents/deletion.py
@@ -0,0 +1,94 @@
+import logging
+from uuid import UUID
+
+from django.db import connections
+
+logger = logging.getLogger(__name__)
+
+
+def delete_element(element_id: UUID) -> None:
+    assert isinstance(element_id, UUID)
+    with connections['default'].cursor() as cursor:
+        # Remove transcription-entity links
+        cursor.execute("""
+        DELETE FROM documents_transcriptionentity te WHERE transcription_id IN (
+            SELECT t.id FROM documents_transcription t
+            LEFT JOIN documents_elementpath elementpath USING (element_id)
+            WHERE t.element_id = %(id)s OR elementpath.path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} transcription-entity links")
+
+        # Remove transcriptions
+        cursor.execute("""
+        DELETE FROM documents_transcription
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} transcriptions")
+
+        # Remove classifications
+        cursor.execute("""
+        DELETE FROM documents_classification
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} classifications")
+
+        # Remove element metadata
+        cursor.execute("""
+        DELETE FROM documents_metadata
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} metadata")
+
+        # Remove element-dataimport links
+        cursor.execute("""
+        DELETE FROM dataimport_dataimportelement
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} usage from dataimport as selection")
+
+        # Set elements directly on dataimports to None
+        cursor.execute("""
+        UPDATE dataimport_dataimport
+        SET element_id = NULL
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} usage from dataimport as element")
+
+        # Remove user selections
+        cursor.execute("""
+        DELETE FROM documents_selection selection
+        WHERE element_id = %(id)s
+        OR element_id IN (
+            SELECT element_id FROM documents_elementpath WHERE path && ARRAY[%(id)s]
+        )
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} user selections")
+
+        # Remove element paths and elements simultaneously
+        cursor.execute("""
+        WITH children_ids (id) AS (
+            DELETE FROM documents_elementpath
+            WHERE element_id = %(id)s OR path && ARRAY[%(id)s]
+            RETURNING element_id
+        )
+        DELETE FROM documents_element element
+        USING children_ids
+        WHERE element.id = children_ids.id
+        """, {'id': element_id})
+        logger.info(f"Deleted {cursor.rowcount} child elements with paths")
diff --git a/arkindex/documents/management/commands/delete.py b/arkindex/documents/management/commands/delete.py
index 44c22f0d6b..f57cfa34d0 100644
--- a/arkindex/documents/management/commands/delete.py
+++ b/arkindex/documents/management/commands/delete.py
@@ -5,12 +5,13 @@ import uuid
 from django.core.management.base import BaseCommand
 
 from arkindex.documents.models import Element
+from arkindex_common.tools import Timer
 
+# Enable deletion signal logs
 logging.basicConfig(
     level=logging.INFO,
     format='[%(levelname)s] %(message)s',
 )
-logger = logging.getLogger(__name__)
 
 
 def valid_uuid(value):
@@ -34,5 +35,7 @@ class Command(BaseCommand):
 
     def handle(self, element_id, verbosity=0, **options):
         for element in Element.objects.filter(id__in=element_id):
-            logger.info(f"Deleting {element.id} : {element}")
-            element.delete()
+            self.stdout.write(f"Deleting {element.id} : {element}")
+            with Timer() as t:
+                element.delete()
+            self.stdout.write(f'Deleted {element} in {t.delta}')
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 2e454de3da..c687b1549e 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -11,6 +11,7 @@ from django.utils.functional import cached_property
 from enumfields import Enum, EnumField
 
 from arkindex.documents.dates import InterpretedDateMixin
+from arkindex.documents.deletion import delete_element
 from arkindex.documents.managers import CorpusManager, ElementManager
 from arkindex.project.default_corpus import DEFAULT_CORPUS_TYPES, DEFAULT_TRANSKRIBUS_TYPES
 from arkindex.project.elastic import ESElement, ESEntity, ESTranscription
@@ -55,15 +56,6 @@ class Corpus(IndexableModel):
     def __str__(self):
         return self.name
 
-    def delete(self):
-        """
-        Override the normal deletion process to delete elements before deleting the corpus:
-        prevents a ProtectedError when element types are being deleted before elements.
-        """
-        with transaction.atomic():
-            self.elements.all().delete()
-            super().delete()
-
     def get_acl_rights(self, user):
         '''
         List Access rights on this corpus for this user
@@ -174,6 +166,14 @@ class Element(IndexableModel):
     es_document = ESElement
     objects = ElementManager()
 
+    def delete(self, *args, **kwargs):
+        """
+        Use raw SQL for element deletion and bypass Django's cascading
+        """
+        assert self.id, 'Cannot delete an element without an ID'
+        delete_element(self.id)
+        Element.objects.filter(id=self.id)._raw_delete(using='default')
+
     @transaction.atomic
     def add_parent(self, parent, skip_children=False):
         '''
diff --git a/arkindex/documents/signals.py b/arkindex/documents/signals.py
index 446c6f9c13..fb937a661e 100644
--- a/arkindex/documents/signals.py
+++ b/arkindex/documents/signals.py
@@ -1,68 +1,9 @@
-import logging
-
-from django.db.models import Q
 from django.db.models.signals import pre_delete
 from django.dispatch import receiver
 
-from arkindex.dataimport.models import DataImport, DataImportElement
-from arkindex.documents.models import (
-    Classification,
-    Element,
-    ElementPath,
-    MetaData,
-    Selection,
-    Transcription,
-    TranscriptionEntity,
-)
-
-logger = logging.getLogger(__name__)
+from arkindex.documents.models import Element
 
 
 @receiver(pre_delete, sender=Element)
-def pre_delete_handler(sender, instance, using, *args, **kwargs):
-    assert isinstance(instance, Element)
-
-    # Build queryset to lookup all children, with or without top element
-    children = Element.objects.get_descending(instance.id).values('id')
-    elements = Q(element_id__in=children) | Q(element_id=instance.id)
-
-    # Remove all transcription entity links
-    deleted, _ = TranscriptionEntity.objects.filter(Q(transcription__element_id__in=children) | Q(transcription__element_id=instance.id)).delete()
-    logger.info(f"Deleted {deleted} transcriptions entities")
-
-    # Remove all transcriptions
-    deleted, _ = Transcription.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} transcriptions")
-
-    # Remove all classifications
-    deleted, _ = Classification.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} classifications")
-
-    # Remove all metadatas
-    deleted, _ = MetaData.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} metadatas")
-
-    # Remove all selections from users
-    deleted, _ = Selection.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} user selections")
-
-    # Remove all usage from a dataimport from users selection
-    deleted, _ = DataImportElement.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} usage from dataimport as selection")
-
-    # Remove all direct usage on a dataimport, without removing the dataimport
-    updated = DataImport.objects.filter(elements).update(element_id=None)
-    logger.info(f"Deleted {updated} usage from dataimport as element")
-
-    # Save children IDs reference before deleting the paths
-    # or the final element deletion query will never get anything to delete
-    children_ids = list(children.values_list('id', flat=True))
-
-    # Remove all paths for these elements
-    deleted, _ = ElementPath.objects.filter(elements).delete()
-    logger.info(f"Deleted {deleted} paths")
-
-    # Finally Remove all children, calling this pre_delete signal recursively
-    # The instance element will be deleted by the parent delete() call
-    deleted = Element.objects.filter(id__in=children_ids)._raw_delete(using=using)
-    logger.info(f"Deleted {deleted} children")
+def pre_delete_handler(sender, instance, *args, **kwargs):
+    raise NotImplementedError('Deleting elements from a QuerySet is not supported. Please call Element.delete on each element.')
diff --git a/arkindex/documents/tests/consumers/test_corpus_consumer.py b/arkindex/documents/tests/consumers/test_corpus_consumer.py
index af38dcf620..619fa828a4 100644
--- a/arkindex/documents/tests/consumers/test_corpus_consumer.py
+++ b/arkindex/documents/tests/consumers/test_corpus_consumer.py
@@ -84,7 +84,7 @@ class TestDeleteCorpus(FixtureTestCase):
         self.corpus.repository = self.repo
         self.corpus.save()
 
-        with self.assertNumQueries(50):
+        with self.assertNumQueries(47):
             CorpusConsumer({}).corpus_delete({'id': str(self.corpus.id)})
 
         # Ensure the command restores the signal receivers
diff --git a/arkindex/documents/tests/test_corpus.py b/arkindex/documents/tests/test_corpus.py
index b816b02bd7..27ea5fa446 100644
--- a/arkindex/documents/tests/test_corpus.py
+++ b/arkindex/documents/tests/test_corpus.py
@@ -3,7 +3,6 @@ from unittest.mock import patch
 
 import mock
 from django.contrib.auth.models import AnonymousUser
-from django.db.models import Model, ProtectedError
 from django.urls import reverse
 from mock import AsyncMock
 from rest_framework import status
@@ -401,22 +400,3 @@ class TestCorpus(FixtureAPITestCase):
                 'batch_size': 1000,
             }
         )
-
-    def test_delete_elements_first(self):
-        """
-        Ensure corpus deletion deletes elements first to prevent a ProtectedError
-        """
-        self.assertEqual(Corpus.objects.count(), 3)
-        self.assertEqual(Element.objects.count(), 29)
-
-        # Use Django's default deletion method and expect a ProtectedError
-        with self.assertRaises(ProtectedError):
-            Model.delete(self.corpus)
-
-        self.assertEqual(Corpus.objects.count(), 3)
-        self.assertEqual(Element.objects.count(), 29)
-
-        # Use the corpus deletion method and it works!
-        self.corpus.delete()
-        self.assertEqual(Corpus.objects.count(), 2)
-        self.assertEqual(Element.objects.count(), 0)
diff --git a/arkindex/documents/tests/test_destroy_elements.py b/arkindex/documents/tests/test_destroy_elements.py
index a4e43612a3..982bd03888 100644
--- a/arkindex/documents/tests/test_destroy_elements.py
+++ b/arkindex/documents/tests/test_destroy_elements.py
@@ -1,7 +1,7 @@
 from django.urls import reverse
 from rest_framework import status
 
-from arkindex.documents.models import Corpus
+from arkindex.documents.models import Corpus, Element
 from arkindex.project.tests import FixtureAPITestCase
 
 
@@ -15,7 +15,8 @@ class TestDestroyElements(FixtureAPITestCase):
         cls.private_corpus = Corpus.objects.create(name='private', public=False)
 
     def test_element_destroy_verified_user(self):
-        response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)}))
+        with self.assertNumQueries(0):
+            response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)}))
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
 
     def test_element_destroy_acl(self):
@@ -25,7 +26,8 @@ class TestDestroyElements(FixtureAPITestCase):
             type=self.volume_type,
             name='Castle story'
         )
-        response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)}))
+        with self.assertNumQueries(4):
+            response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)}))
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
         self.assertDictEqual(
             response.json(),
@@ -39,6 +41,19 @@ class TestDestroyElements(FixtureAPITestCase):
             name='Castle story'
         )
         self.assertEqual(self.corpus.elements.filter(id=castle_story.id).exists(), True)
-        response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)}))
+        with self.assertNumQueries(13):
+            response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(castle_story.id)}))
         self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
         self.assertEqual(self.corpus.elements.filter(id=castle_story.id).exists(), False)
+
+    def test_non_empty_element(self):
+        """
+        We can now delete a non-empty element
+        """
+        self.client.force_login(self.user)
+        with self.assertNumQueries(13):
+            response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)}))
+        self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
+
+        with self.assertRaises(Element.DoesNotExist):
+            self.vol.refresh_from_db()
diff --git a/arkindex/documents/tests/test_retrieve_elements.py b/arkindex/documents/tests/test_retrieve_elements.py
index aae92d1a99..b9e593af2f 100644
--- a/arkindex/documents/tests/test_retrieve_elements.py
+++ b/arkindex/documents/tests/test_retrieve_elements.py
@@ -1,7 +1,7 @@
 from django.urls import reverse
 from rest_framework import status
 
-from arkindex.documents.models import Classification, Corpus, DataSource, Element, Entity, MLClass
+from arkindex.documents.models import Classification, Corpus, DataSource, Entity, MLClass
 from arkindex.project.tests import FixtureAPITestCase
 from arkindex_common.enums import EntityType, MetaType
 from arkindex_common.ml_tool import MLToolType
@@ -127,17 +127,6 @@ class TestRetrieveElements(FixtureAPITestCase):
         self.assertEqual(response.status_code, status.HTTP_200_OK)
         self.assertIsNone(response.json()['thumbnail_put_url'])
 
-    def test_non_empty_folder(self):
-        """
-        We can now delete a non-empty folder
-        """
-        self.client.force_login(self.user)
-        response = self.client.delete(reverse('api:element-retrieve', kwargs={'pk': str(self.vol.id)}))
-        self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
-
-        with self.assertRaises(Element.DoesNotExist):
-            self.vol.refresh_from_db()
-
     def test_list_element_metadata(self):
         with self.assertNumQueries(6):
             response = self.client.get(reverse('api:element-retrieve', kwargs={'pk': str(self.page.id)}))
-- 
GitLab