From f381702cc59f2127972fd8bb7a42ced035ca0566 Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Tue, 18 May 2021 12:39:03 +0000
Subject: [PATCH] Optimize ListCorpusWorkerVersions

---
 arkindex/dataimport/api.py                |  34 +++++--
 arkindex/dataimport/tests/test_workers.py | 114 +++++++++++++++-------
 2 files changed, 105 insertions(+), 43 deletions(-)

diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index b8d053f61b..edbeed5ced 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -80,6 +80,7 @@ from arkindex.project.mixins import (
     SelectionMixin,
     WorkerACLMixin,
 )
+from arkindex.project.pagination import CustomCursorPagination
 from arkindex.project.permissions import IsInternalUser, IsVerified, IsVerifiedOrReadOnly
 from arkindex.project.tools import RTrimChr
 from arkindex.users.models import OAuthCredentials, Role
@@ -907,18 +908,27 @@ class WorkerVersionList(WorkerACLMixin, ListCreateAPIView):
 
 @extend_schema_view(
     get=extend_schema(
-        tags=['ml'],
         operation_id='ListCorpusWorkerVersions',
+        tags=['ml'],
         description=(
             'List worker versions used by elements of a given corpus.\n\n'
             'No check is performed on workers access level in order to allow any user to see versions.'
         ),
+        parameters=[
+            OpenApiParameter(
+                'with_element_count',
+                type=bool,
+                default=False,
+                description='Include element counts in the response.',
+            )
+        ],
     )
 )
 class CorpusWorkerVersionList(CorpusACLMixin, ListAPIView):
     """
     List worker versions used by elements of a given corpus.
     """
+    pagination_class = CustomCursorPagination
     permission_classes = (IsVerifiedOrReadOnly, )
     serializer_class = WorkerVersionSerializer
     # For OpenAPI type discovery
@@ -929,12 +939,24 @@ class CorpusWorkerVersionList(CorpusACLMixin, ListAPIView):
 
     def get_queryset(self):
         corpus = self.get_corpus()
-        return WorkerVersion.objects \
+        queryset = WorkerVersion.objects \
             .filter(elements__corpus_id=corpus.id) \
-            .select_related('revision__repo', 'worker', 'worker__repository') \
-            .prefetch_related('revision__refs', 'revision__versions') \
-            .order_by('-revision__created') \
-            .annotate(element_count=Count('id'))
+            .prefetch_related(
+                'revision__repo',
+                'revision__refs',
+                'revision__versions',
+                'worker__repository',
+            ) \
+            .order_by('-id')
+
+        if self.request.query_params.get('with_element_count', '').lower() in ('true', '1'):
+            queryset = queryset.annotate(element_count=Count('id'))
+        else:
+            # The Count() causes Django to add a GROUP BY, and without a count we need a DISTINCT
+            # because filtering on `elements` causes worker versions to be duplicated.
+            queryset = queryset.distinct()
+
+        return queryset
 
 
 @extend_schema(tags=['repos'])
diff --git a/arkindex/dataimport/tests/test_workers.py b/arkindex/dataimport/tests/test_workers.py
index abc232f04d..8a04e4b11c 100644
--- a/arkindex/dataimport/tests/test_workers.py
+++ b/arkindex/dataimport/tests/test_workers.py
@@ -842,45 +842,49 @@ class TestWorkersWorkerVersions(FixtureAPITestCase):
         self.assertEqual(self.version_1.docker_command, 'mysupercommand')
         self.version_1.configuration = {"test": "test1"}
 
-    def _assert_corpus_worker_version_list(self, response):
-        self.assertDictEqual(response.json(), {
-            'count': 1,
-            'next': None,
-            'number': 1,
-            'previous': None,
-            'results': [{
-                'id': str(self.version_1.id),
-                'configuration': {'test': 42},
-                'revision': {
-                    'id': str(self.version_1.revision_id),
-                    'hash': '1337',
-                    'author': 'Test user',
-                    'message': 'My w0rk3r',
-                    'created': '2020-02-02T01:23:45.678000Z',
-                    'commit_url': 'http://my_repo.fake/workers/worker/commit/1337',
-                    'refs': []
-                },
-                'docker_image': str(self.version_1.docker_image_id),
-                'docker_image_iid': None,
-                'docker_image_name': self.version_1.docker_image_name,
-                'state': 'available',
-                'element_count': 9,
-                'worker': {
-                    'id': str(self.worker_1.id),
-                    'name': self.worker_1.name,
-                    'type': self.worker_1.type,
-                    'slug': self.worker_1.slug,
-                }
-            }]
-        })
+    def _serialize_worker_version(self, version, element_count=False):
+        data = {
+            'id': str(version.id),
+            'configuration': {'test': 42},
+            'revision': {
+                'id': str(version.revision_id),
+                'hash': '1337',
+                'author': 'Test user',
+                'message': 'My w0rk3r',
+                'created': '2020-02-02T01:23:45.678000Z',
+                'commit_url': 'http://my_repo.fake/workers/worker/commit/1337',
+                'refs': []
+            },
+            'docker_image': str(version.docker_image_id),
+            'docker_image_iid': None,
+            'docker_image_name': version.docker_image_name,
+            'state': 'available',
+            'worker': {
+                'id': str(version.worker.id),
+                'name': version.worker.name,
+                'type': version.worker.type,
+                'slug': version.worker.slug,
+            }
+        }
+        if element_count:
+            data['element_count'] = version.elements.filter(corpus=self.corpus).count()
+        return data
 
     def test_corpus_worker_version_no_login(self):
         self.corpus.elements.filter(type__slug='word').update(worker_version=self.version_1)
 
-        with self.assertNumQueries(7):
+        with self.assertNumQueries(8):
             response = self.client.get(reverse('api:corpus-versions', kwargs={'pk': self.corpus.id}))
             self.assertEqual(response.status_code, status.HTTP_200_OK)
-            self._assert_corpus_worker_version_list(response)
+
+        self.assertDictEqual(response.json(), {
+            'count': None,
+            'previous': None,
+            'next': None,
+            'results': [
+                self._serialize_worker_version(self.version_1)
+            ]
+        })
 
     def test_corpus_worker_version_not_verified(self):
         self.user.verified_email = False
@@ -888,16 +892,52 @@ class TestWorkersWorkerVersions(FixtureAPITestCase):
         self.client.force_login(self.user)
         self.corpus.elements.filter(type__slug='word').update(worker_version=self.version_1)
 
-        with self.assertNumQueries(11):
+        with self.assertNumQueries(12):
             response = self.client.get(reverse('api:corpus-versions', kwargs={'pk': self.corpus.id}))
             self.assertEqual(response.status_code, status.HTTP_200_OK)
-            self._assert_corpus_worker_version_list(response)
+
+        self.assertDictEqual(response.json(), {
+            'count': None,
+            'previous': None,
+            'next': None,
+            'results': [
+                self._serialize_worker_version(self.version_1)
+            ]
+        })
 
     def test_corpus_worker_version_list(self):
         self.client.force_login(self.user)
         self.corpus.elements.filter(type__slug='word').update(worker_version=self.version_1)
 
-        with self.assertNumQueries(11):
+        with self.assertNumQueries(12):
             response = self.client.get(reverse('api:corpus-versions', kwargs={'pk': self.corpus.id}))
             self.assertEqual(response.status_code, status.HTTP_200_OK)
-            self._assert_corpus_worker_version_list(response)
+
+        self.assertDictEqual(response.json(), {
+            'count': None,
+            'previous': None,
+            'next': None,
+            'results': [
+                self._serialize_worker_version(self.version_1)
+            ]
+        })
+
+    def test_corpus_worker_version_list_with_element_count(self):
+        self.client.force_login(self.user)
+        self.corpus.elements.filter(type__slug='word').update(worker_version=self.version_1)
+
+        with self.assertNumQueries(12):
+            response = self.client.get(
+                reverse('api:corpus-versions', kwargs={'pk': self.corpus.id}),
+                {'with_element_count': 'true'}
+            )
+            self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+        self.assertDictEqual(response.json(), {
+            'count': None,
+            'previous': None,
+            'next': None,
+            'results': [
+                self._serialize_worker_version(self.version_1, element_count=True)
+            ]
+        })
-- 
GitLab