From cce65d73abc7faba186ef5fc6be9d3e5c0522ab9 Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Fri, 2 Dec 2022 16:10:20 +0100
Subject: [PATCH] Add a DISTINCT on initialize_activity

---
 arkindex/process/managers.py                  |  2 +-
 arkindex/process/tests/test_workeractivity.py | 26 +++++++++++++++++++
 .../workeractivity_bulk_insert.sql            |  2 +-
 ...ractivity_bulk_insert_no_configuration.sql |  2 +-
 4 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/arkindex/process/managers.py b/arkindex/process/managers.py
index 83c628646a..81991e0f06 100644
--- a/arkindex/process/managers.py
+++ b/arkindex/process/managers.py
@@ -22,7 +22,7 @@ class ActivityManager(models.Manager):
 
         assert isinstance(state, WorkerActivityState), 'State should be an instance of WorkerActivityState'
 
-        sql, params = elements_qs.values('id').query.sql_with_params()
+        sql, params = elements_qs.distinct().values('id').query.sql_with_params()
         select_params = (worker_version_id, configuration_id, state.value, process_id) + params
 
         # With ON CONFLICT, the target constraint is only optional when the action is DO NOTHING.
diff --git a/arkindex/process/tests/test_workeractivity.py b/arkindex/process/tests/test_workeractivity.py
index 9733c45f58..41243ae4ba 100644
--- a/arkindex/process/tests/test_workeractivity.py
+++ b/arkindex/process/tests/test_workeractivity.py
@@ -64,6 +64,32 @@ class TestWorkerActivity(FixtureTestCase):
         self.assertEqual(elements_qs.count(), 5)
         self.assertEqual(WorkerActivity.objects.filter(state=WorkerActivityState.Started, process=self.process).count(), 5)
 
+    def test_bulk_insert_activity_duplicate_elements(self):
+        """
+        WorkerActivity.bulk_insert should exclude duplicate elements
+        """
+        element_type = self.corpus.types.first()
+        parent1 = self.corpus.elements.create(type=element_type, name='Parent 1')
+        parent2 = self.corpus.elements.create(type=element_type, name='Parent 2')
+        element = self.corpus.elements.create(type=element_type, name='Element')
+        child = self.corpus.elements.create(type=element_type, name='Child')
+        element.add_parent(parent1)
+        element.add_parent(parent2)
+        child.add_parent(element)
+        elements_qs = Element.objects.filter(paths__path__contains=[element.id], name='Child')
+        # `child` has two paths that both contain the ID of `element`, because `element` has two parents,
+        # so filtering on paths__path will duplicate the child
+        self.assertEqual(elements_qs.count(), 2)
+
+        WorkerActivity.objects.bulk_insert(
+            self.worker_version.id,
+            self.process.id,
+            self.configuration.id,
+            elements_qs,
+            state=WorkerActivityState.Started,
+        )
+        self.assertEqual(self.process.activities.filter(state=WorkerActivityState.Started).get().element_id, child.id)
+
     def test_bulk_insert_activity_children(self):
         """
         Bulk insert worker activities for acts
diff --git a/arkindex/sql_validation/workeractivity_bulk_insert.sql b/arkindex/sql_validation/workeractivity_bulk_insert.sql
index 911bb32b58..18bc1c4a2f 100644
--- a/arkindex/sql_validation/workeractivity_bulk_insert.sql
+++ b/arkindex/sql_validation/workeractivity_bulk_insert.sql
@@ -8,7 +8,7 @@ SELECT elt.id,
        current_timestamp,
        current_timestamp
 FROM
-    (SELECT "documents_element"."id"
+    (SELECT DISTINCT "documents_element"."id"
      FROM "documents_element"
      INNER JOIN "documents_elementtype" ON ("documents_element"."type_id" = "documents_elementtype"."id")
      WHERE ("documents_elementtype"."corpus_id" = '{corpus_id}'::uuid
diff --git a/arkindex/sql_validation/workeractivity_bulk_insert_no_configuration.sql b/arkindex/sql_validation/workeractivity_bulk_insert_no_configuration.sql
index a3af27ec29..a10ac8e18b 100644
--- a/arkindex/sql_validation/workeractivity_bulk_insert_no_configuration.sql
+++ b/arkindex/sql_validation/workeractivity_bulk_insert_no_configuration.sql
@@ -8,7 +8,7 @@ SELECT elt.id,
        current_timestamp,
        current_timestamp
 FROM
-    (SELECT "documents_element"."id"
+    (SELECT DISTINCT "documents_element"."id"
      FROM "documents_element"
      INNER JOIN "documents_elementtype" ON ("documents_element"."type_id" = "documents_elementtype"."id")
      WHERE ("documents_elementtype"."corpus_id" = '{corpus_id}'::uuid
-- 
GitLab