diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py index de95bd753848bc57935dd2658b9c8ec7bd1fcb33..a4cd2c5c0ae40073655f90a49670d2cc154f4056 100644 --- a/arkindex/documents/api/search.py +++ b/arkindex/documents/api/search.py @@ -68,6 +68,13 @@ solr = SolrClient(settings.SOLR_API_URL) default={}, required=False ), + OpenApiParameter( + 'sort', + description='Criterion on which to sort search results', + enum=['relevance', 'element_name'], + default='relevance', + required=False, + ), OpenApiParameter( 'only_facets', type=bool, @@ -89,10 +96,18 @@ class CorpusSearch(CorpusACLMixin, APIView): facets_query = " AND ".join([f'{key}:"{value}"' for key, value in facets.items()]) return f'({sources_query}) AND ({facets_query})' if facets_query else sources_query - def solr_search(self, collection_name, query, page): + def solr_search(self, collection_name, query, sort, page): + sort_params = { + 'relevance': 'score desc', + 'element_name': 'element_text asc', + } + if sort not in sort_params: + raise ValueError('Unknown sort parameter') + return solr.query(collection_name, { 'q': query, 'start': (page - 1) * settings.SOLR_PAGINATION_SIZE, + 'sort': sort_params[sort], 'rows': settings.SOLR_PAGINATION_SIZE, 'facet': True, 'facet.field': [ @@ -136,15 +151,15 @@ class CorpusSearch(CorpusACLMixin, APIView): params['sources'] = request.query_params.getlist('sources') + request.query_params.getlist('sources[]') serializer = CorpusSearchQuerySerializer(data=params) serializer.is_valid(raise_exception=True) - q, sources, only_facets, page = map( + q, sources, only_facets, sort, page = map( lambda key: serializer.validated_data.pop(key), - ('query', 'sources', 'only_facets', 'page') + ('query', 'sources', 'only_facets', 'sort', 'page') ) # Paginated Solr search query = self.build_solr_query(q, sources, serializer.validated_data) try: - results = self.solr_search(collection_name, query, page) + results = self.solr_search(collection_name, query, sort, page) except SolrError: raise ValidationError([f'Query "{q}" is not valid.']) diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py index 08272c6c198fb961df22d290837c10ace9b75213..b838eecf5a6c355069f84db3212dfbcc53699f15 100644 --- a/arkindex/documents/indexer.py +++ b/arkindex/documents/indexer.py @@ -84,6 +84,16 @@ class Indexer(object): 'filters': [{'class': 'solr.LowerCaseFilterFactory'}] } }, + # Same as string, but the field also stores the raw text before analysis + # so that sorting can be done on the entire field, and not just each token. + { + 'name': 'sortable_string', + 'class': 'solr.SortableTextField', + 'analyzer': { + 'tokenizer': {'class': 'solr.ClassicTokenizerFactory'}, + 'filters': [{'class': 'solr.LowerCaseFilterFactory'}] + } + }, # Updated full_string analyzer to support facets with full text { 'name': 'full_string', @@ -101,7 +111,7 @@ class Indexer(object): {'name': 'parent_type', 'indexed': False, 'required': True, 'type': 'full_string'}, # Element fields {'name': 'element_id', 'indexed': False, 'required': True, 'type': 'uuid'}, - {'name': 'element_text', 'indexed': True, 'required': True, 'type': 'string'}, + {'name': 'element_text', 'indexed': True, 'required': True, 'type': 'sortable_string'}, {'name': 'element_type', 'indexed': True, 'required': True, 'type': 'full_string'}, {'name': 'element_worker', 'indexed': True, 'required': False, 'type': 'full_string'}, {'name': 'element_image', 'indexed': False, 'required': False, 'type': 'full_string'}, diff --git a/arkindex/documents/management/commands/reindex.py b/arkindex/documents/management/commands/reindex.py index e795be6ed5682ea871764185b18dfb9dcb44908d..aafa67ce3b56c55ba1c1678696ce0ac7fde84d06 100644 --- a/arkindex/documents/management/commands/reindex.py +++ b/arkindex/documents/management/commands/reindex.py @@ -14,7 +14,10 @@ class Command(BaseCommand): def add_arguments(self, parser): super().add_arguments(parser) parser.add_argument( - "corpus_id", help="UUID of an existing corpus to reindex", type=uuid.UUID + "--all", help="Reindex all corpus", action="store_true", + ) + parser.add_argument( + "-c", "--corpus-id", help="UUID of an existing corpus to reindex", type=uuid.UUID ) parser.add_argument( '--drop', @@ -33,18 +36,21 @@ class Command(BaseCommand): 'Consider setting `features.search` to `on` or `true` or `yes` in the YAML ' 'configuration file, and configuring Solr properly.') - try: - corpus = Corpus.objects.get(id=corpus_id) - except Corpus.DoesNotExist: - raise CommandError(f'Corpus {corpus_id} does not exist') - - if not corpus.indexable: - raise CommandError(f'Corpus {corpus.name} is not indexable') - - indexer = Indexer(corpus.id) - indexer.setup() - if options.get('setup'): - return - if options.get('drop'): - indexer.drop_index() - indexer.index() + if options["all"]: + corpora = Corpus.objects.filter(indexable=True) + elif corpus_id: + corpora = Corpus.objects.filter(id=corpus_id, indexable=True) + if not corpora.exists(): + raise CommandError(f'Corpus {corpus_id} does not exist or is not indexable') + else: + raise CommandError("Either spcify a corpus ID with --corpus-id or index all corpus through --all") + + for corpus in corpora: + self.stdout.write(f"Indexing {corpus.name}") + indexer = Indexer(corpus.id) + indexer.setup() + if options.get('setup'): + return + if options.get('drop'): + indexer.drop_index() + indexer.index() diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py index 67995b08b18e4b3161bcf64dd65add9ea117f207..dfc0d1bbaccff666c036431a16cb29c719dcd9e6 100644 --- a/arkindex/documents/serializers/search.py +++ b/arkindex/documents/serializers/search.py @@ -87,6 +87,10 @@ class CorpusSearchQuerySerializer(serializers.Serializer): ], default={'element', 'transcription', 'metadata', 'entity'}) only_facets = serializers.BooleanField(default=False) page = serializers.IntegerField(default=1, min_value=1) + sort = serializers.ChoiceField( + choices=['relevance', 'element_name'], + default='relevance', + ) # Facets element_type = serializers.CharField(required=False) diff --git a/arkindex/documents/tests/commands/test_reindex.py b/arkindex/documents/tests/commands/test_reindex.py index bb5c82b01756be6eed606894f543f3c58cadba07..776386c32787e6db4ad394b31cb1173f3f4e815d 100644 --- a/arkindex/documents/tests/commands/test_reindex.py +++ b/arkindex/documents/tests/commands/test_reindex.py @@ -41,7 +41,7 @@ class TestReindexCommand(FixtureTestCase): mock_solr.collections.exists.return_value = False mock_solr.schema.does_field_exist.return_value = False - call_command('reindex', self.private_corpus.id, setup=True) + call_command('reindex', '-c', self.private_corpus.id, setup=True) self.assertEqual(mock_solr.collections.create.call_count, 1) self.assertEqual(mock_solr.schema.create_field.call_count, len(indexer.solr_fields)) self.assertEqual(mock_solr.schema.replace_field.call_count, 0) @@ -57,7 +57,7 @@ class TestReindexCommand(FixtureTestCase): self.page_type.indexable = False self.page_type.save() - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 0) @@ -70,7 +70,7 @@ class TestReindexCommand(FixtureTestCase): word = self.private_corpus.elements.create(name='A word', type=self.word_type) word.add_parent(self.line) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -129,7 +129,7 @@ class TestReindexCommand(FixtureTestCase): worker_version=self.worker_version, ) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -206,7 +206,7 @@ class TestReindexCommand(FixtureTestCase): ml_class=self.private_corpus.ml_classes.create(name='Dog') ) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -287,7 +287,7 @@ class TestReindexCommand(FixtureTestCase): worker_version=self.worker_version, ) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -374,7 +374,7 @@ class TestReindexCommand(FixtureTestCase): entity=entity_2 ) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -472,7 +472,7 @@ class TestReindexCommand(FixtureTestCase): self.line.worker_version = self.worker_version self.line.save() - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -513,7 +513,7 @@ class TestReindexCommand(FixtureTestCase): new_page = self.private_corpus.elements.create(name='New page', type=self.page_type) self.line.add_parent(new_page) - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) self.assertEqual(mock_solr.index.call_count, 1) (index_name, documents), kwargs = mock_solr.index.call_args @@ -575,7 +575,7 @@ class TestReindexCommand(FixtureTestCase): self.page_type.indexable = False self.page_type.save() - call_command('reindex', self.private_corpus.id, drop=True) + call_command('reindex', '-c', self.private_corpus.id, drop=True) self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args self.assertEqual(index_name, f'project-{self.private_corpus.id}') @@ -586,10 +586,10 @@ class TestReindexCommand(FixtureTestCase): def test_corpus_not_found(self): corpus_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' with self.assertRaises(CommandError) as context: - call_command('reindex', corpus_id) + call_command('reindex', '-c', corpus_id) self.assertEqual( str(context.exception), - f'Corpus {corpus_id} does not exist' + f'Corpus {corpus_id} does not exist or is not indexable' ) @override_settings(ARKINDEX_FEATURES={'search': True}) @@ -598,20 +598,30 @@ class TestReindexCommand(FixtureTestCase): self.private_corpus.save() with self.assertRaises(CommandError) as context: - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual( str(context.exception), - f'Corpus {self.private_corpus.name} is not indexable' + f'Corpus {self.private_corpus.id} does not exist or is not indexable' ) @override_settings(ARKINDEX_FEATURES={'search': False}) def test_no_search(self): with self.assertRaises(CommandError) as context: - call_command('reindex', self.private_corpus.id) + call_command('reindex', '-c', self.private_corpus.id) self.assertEqual( str(context.exception), 'Reindexation is not possible if the search feature flag is disabled. ' 'Consider setting `features.search` to `on` or `true` or `yes` in the YAML ' 'configuration file, and configuring Solr properly.' ) + + @override_settings(ARKINDEX_FEATURES={'search': True}) + @patch('arkindex.documents.indexer.solr') + def test_index_all_corpus(self, mock_solr): + """ + Test the reindex command for all corpus + """ + call_command('reindex', '--all', '--drop') + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) + self.assertEqual(mock_solr.index.call_count, 1) diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py index e6e814fde8936c90609763ce1ce3d63511fea5f7..7e2997f86b3adb98bd44de453f17351e82a6fba2 100644 --- a/arkindex/documents/tests/test_search_api.py +++ b/arkindex/documents/tests/test_search_api.py @@ -179,6 +179,7 @@ class TestSearchApi(FixtureAPITestCase): self.assertDictEqual(args, { 'start': 0, 'rows': 20, + 'sort': 'score desc', 'facet': True, 'facet.field': [ 'element_type', @@ -269,3 +270,46 @@ class TestSearchApi(FixtureAPITestCase): # Check Solr call (_, args), _ = mock_solr.query.call_args self.assertEqual(args['start'], 20) + + @override_settings(ARKINDEX_FEATURES={'search': True}) + @patch('arkindex.documents.api.search.solr') + def test_search_sort_element_name(self, mock_solr): + mock_solr.collections.exists.return_value = True + solr_response = self.build_solr_response() + mock_solr.query.return_value = solr_response + + with self.assertNumQueries(1): + response = self.client.get( + reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), + {'query': 'test', 'sort': 'element_name'} + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + self.assertDictEqual(response.json(), { + 'count': 0, + 'number': 1, + 'previous': None, + 'next': None, + 'results': [], + 'facets': solr_response.get_facets() + }) + + (_, args), _ = mock_solr.query.call_args + self.assertEqual(args['sort'], 'element_text asc') + + @override_settings(ARKINDEX_FEATURES={'search': True}) + @patch('arkindex.documents.api.search.solr') + def test_search_sort_unknown(self, mock_solr): + mock_solr.collections.exists.return_value = True + + with self.assertNumQueries(1): + response = self.client.get( + reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), + {'query': 'test', 'sort': 'element_atomic_number'} + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + self.assertDictEqual(response.json(), { + 'sort': ['"element_atomic_number" is not a valid choice.'] + }) + self.assertFalse(mock_solr.query.called)