diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py index 3ef0188ad3be6b5fc6abd41d36efd969832cbf44..bee4bd7fc983ac95e51ccec5c374cb2a965a3440 100644 --- a/arkindex/documents/indexer.py +++ b/arkindex/documents/indexer.py @@ -60,12 +60,6 @@ INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.i class Indexer: - - # The query yielding all the elements to run on will look for all the child elements of all indexable elements - # The joins can take a very long time, so the query gets split into one to fetch all the indexable elements, - # then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET. - sql_chunk_size = 10000 - # Number of elements to load in Python from all of the SQL queries (can generate many more documents) elements_chunk_size = 100 @@ -139,10 +133,15 @@ class Indexer: {"name": "entity_worker", "indexed": True, "required": False, "type": "full_string"} ] - def __init__(self, corpus_id): + def __init__(self, corpus_id, sql_chunk_size=10000): self.corpus_id = corpus_id self.collection_name = f"project-{self.corpus_id}" + # The query yielding all the elements to run on will look for all the child elements of all indexable elements + # The joins can take a very long time, so the query gets split into one to fetch all the indexable elements, + # then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET. + self.sql_chunk_size = sql_chunk_size + def setup(self): """ Create collection in Solr diff --git a/arkindex/documents/management/commands/reindex.py b/arkindex/documents/management/commands/reindex.py index 7b5415240f296ac6328786d209138bcca8228fc3..75fdd2c3a055da418b93e1b80e72fd883f42d88a 100644 --- a/arkindex/documents/management/commands/reindex.py +++ b/arkindex/documents/management/commands/reindex.py @@ -29,13 +29,22 @@ class Command(BaseCommand): help="Only setup a collection. Create a collection and fields if they do not exist or update the fields", action="store_true", ) + parser.add_argument( + "--sql-chunk-size", + help="Size of the chunks of parent elements used when retrieving all children of indexable elements from the database.", + type=int, + default=10000, + ) - def handle(self, corpus_id, **options): + def handle(self, corpus_id, sql_chunk_size, **options): if not settings.ARKINDEX_FEATURES["search"]: raise CommandError("Reindexation is not possible if the search feature flag is disabled. " "Consider setting `features.search` to `on` or `true` or `yes` in the YAML " "configuration file, and configuring Solr properly.") + if sql_chunk_size <= 0: + raise CommandError("--sql-chunk-size must be set to a strictly positive integer.") + if options["all"]: corpora = Corpus.objects.filter(indexable=True) elif corpus_id: @@ -47,7 +56,7 @@ class Command(BaseCommand): for corpus in corpora: self.stdout.write(f"Indexing {corpus.name}") - indexer = Indexer(corpus.id) + indexer = Indexer(corpus.id, sql_chunk_size=sql_chunk_size) if options.get("drop"): indexer.drop_index() indexer.setup() diff --git a/arkindex/documents/tests/commands/test_reindex.py b/arkindex/documents/tests/commands/test_reindex.py index a3ce57de6ee76cb65d070b018070d7b0c552d7d4..561a7e0ba94bca647a0446b0a6e5c9b5a35d1d49 100644 --- a/arkindex/documents/tests/commands/test_reindex.py +++ b/arkindex/documents/tests/commands/test_reindex.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import call, patch from django.core.management import CommandError, call_command from django.test import override_settings @@ -637,3 +637,19 @@ class TestReindexCommand(FixtureTestCase): call_command("reindex", "--all", "--drop") self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) self.assertEqual(mock_solr.index.call_count, 1) + + @override_settings(ARKINDEX_FEATURES={"search": True}) + def test_invalid_sql_chunk_size(self): + with self.assertRaises(CommandError) as context: + call_command("reindex", "--all", "--sql-chunk-size", -1) + self.assertEqual( + str(context.exception), + "--sql-chunk-size must be set to a strictly positive integer." + ) + + @override_settings(ARKINDEX_FEATURES={"search": True}) + @patch("arkindex.documents.management.commands.reindex.Indexer") + def test_sql_chunk_size(self, mock_indexer): + call_command("reindex", "-c", self.private_corpus.id, "--sql-chunk-size", 2) + self.assertEqual(mock_indexer.call_count, 1) + self.assertEqual(mock_indexer.call_args, call(self.private_corpus.id, sql_chunk_size=2))