From 5b5d8b8033eacf01c073b811c9a65ed959ba22f6 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Wed, 19 Jan 2022 08:36:08 +0000 Subject: [PATCH] Filter on the corpus ID of ElementTypes --- arkindex/documents/api/elements.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arkindex/documents/api/elements.py b/arkindex/documents/api/elements.py index 2018fa0946..1c9c5cea43 100644 --- a/arkindex/documents/api/elements.py +++ b/arkindex/documents/api/elements.py @@ -571,6 +571,17 @@ class ElementsListBase(CorpusACLMixin, DestroyModelMixin, ListAPIView): if self.type_filter: filters['type'] = self.type_filter elif self.folder_filter is not None: + # When filtering for folder or non-folder elements, using only the type__folder filter + # can cause Postgres to retrieve all the {non-}folder types on every corpus. + # This can reach hundreds of types as the database grows, so Postgres can end up using a Hash Join + # to handle joining this large number of elements and types. + # Since Postgres estimates this to represent a large number of rows, it might also use multi-processing, + # which has a very high overhead. + # This can be avoided by also filtering on the type's corpus: Postgres will then access the index + # on the type's corpus ID. The query planner's statistics will give it a very low estimate since there + # are rarely many types in a corpus, so Postgres will also use the type_id index on elements, which + # will reduce the number of rows much more quickly, making it stop using multi-processing. + filters['type__corpus'] = self.selected_corpus filters['type__folder'] = self.folder_filter if 'worker_version' in self.clean_params: -- GitLab