diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py index 783feb4042ca08766b6f993ab924d8a01d8f5f4b..de1446d2a7666b53379990204aecfbc900625a6e 100644 --- a/arkindex/documents/api/search.py +++ b/arkindex/documents/api/search.py @@ -2,9 +2,9 @@ from django.conf import settings from rest_framework.generics import ListAPIView from arkindex.documents.serializers.search import \ ElementSearchResultSerializer, EntitySearchResultSerializer, EntitySearchQuerySerializer -from arkindex.documents.search import search_pages_post, search_acts_post, search_entities_post +from arkindex.documents.search import search_elements_post, search_entities_post from arkindex.project.mixins import SearchAPIMixin -from arkindex.project.elastic import ESPage, ESAct, ESEntity +from arkindex.project.elastic import ESElement, ESEntity from elasticsearch_dsl.query import Q, Nested, FunctionScore from elasticsearch_dsl.function import FieldValueFactor @@ -15,16 +15,31 @@ class SearchAPIView(SearchAPIMixin, ListAPIView): """ -class NestedSearchAPIView(SearchAPIView): - es_document = None +class ElementSearch(SearchAPIView): serializer_class = ElementSearchResultSerializer - - def get_search(self, corpora_ids=None, query=None, type=None, date_lt=None, date_gte=None, min_score=0.0): - assert self.es_document is not None, 'Missing ES document. Set the `es_document` attribute' + openapi_overrides = { + 'operationId': 'SearchElements', + 'security': [], + 'description': 'Get a list of elements with their parents, the total number of transcriptions ' + 'in each element, and a few (not all) of their transcriptions, with their source, ' + 'type, zone and image, for a given query.', + 'tags': ['search'], + } + + def get_search(self, + corpora_ids=None, + query=None, + element_type=None, + transcription_type=None, + date_lt=None, + date_gte=None, + min_score=0.0): assert corpora_ids, 'Must filter by corpora' - search = self.es_document.search() \ - .sort('volumes', '_score') \ + # Note that sorting by parents will not sort properly if there are multiple parents + # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#_sort_mode_option + search = ESElement.search() \ + .sort('parents', '_score') \ .source(fields=['date_range']) \ .filter('terms', corpus=corpora_ids) @@ -37,11 +52,14 @@ class NestedSearchAPIView(SearchAPIView): search = search.filter('range', date_range=date_range) + if element_type: + search = search.filter('match', type=element_type.value) + nested_query = Q('range', transcriptions__score={'gte': min_score}) if query: nested_query &= Q('simple_query_string', query=query, fields=['transcriptions.text']) - if type: - nested_query &= Q('match', transcriptions__type=type.value) + if transcription_type: + nested_query &= Q('match', transcriptions__type=transcription_type.value) search = search.query(Nested( path='transcriptions', @@ -55,26 +73,8 @@ class NestedSearchAPIView(SearchAPIView): return search - -class PageSearch(NestedSearchAPIView): - """ - Search and list transcriptions inside pages - """ - es_document = ESPage - - def post_process(self, *args, **kwargs): - return search_pages_post(*args) - - -class ActSearch(NestedSearchAPIView): - """ - Search for acts containing a specific word - within a specific period - """ - es_document = ESAct - def post_process(self, *args, **kwargs): - return search_acts_post(*args) + return search_elements_post(*args) class EntitySearch(SearchAPIView): diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py index 33d17baab65cd5a327c67107a8986c1bf297195f..ee62f481f572eb4e7ecb66cc3ec0f39ef64775c7 100644 --- a/arkindex/documents/indexer.py +++ b/arkindex/documents/indexer.py @@ -2,7 +2,7 @@ from django.conf import settings from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk as es_bulk from elasticsearch.exceptions import NotFoundError -from arkindex.project.elastic import ESTranscription, ESAct, ESPage, ESEntity +from arkindex.project.elastic import ESTranscription, ESElement, ESEntity import logging import time import datetime @@ -14,8 +14,7 @@ class Indexer(object): documents = ( ESTranscription, - ESAct, - ESPage, + ESElement, ESEntity, ) diff --git a/arkindex/documents/management/commands/reindex.py b/arkindex/documents/management/commands/reindex.py index 872ed8ff74ac269dd37e0a39ab97090a2941a3c3..67dbff2715e3c9bab711778e12040ad028702bce 100644 --- a/arkindex/documents/management/commands/reindex.py +++ b/arkindex/documents/management/commands/reindex.py @@ -4,7 +4,7 @@ from django.core.management.base import CommandError from ponos.management.base import PonosCommand from arkindex.project.argparse import CorpusArgument, ElementArgument from arkindex.documents.indexer import Indexer -from arkindex.documents.models import Element, ElementType, Act, Transcription, Page, Entity +from arkindex.documents.models import Element, ElementType, Transcription, Entity import logging logging.basicConfig( @@ -14,17 +14,6 @@ logging.basicConfig( logger = logging.getLogger(__name__) -def get_acts(corpus=None, volume=None): - if volume: - queryset = Act.objects.get_descending(volume.id) - elif corpus: - queryset = Act.objects.filter(corpus=corpus) - else: - queryset = Act.objects.all() - - return queryset.prefetch_related('metadatas') - - def get_transcriptions(corpus=None, volume=None): if volume: # Lookup all the transcriptions linked to a volume @@ -37,15 +26,17 @@ def get_transcriptions(corpus=None, volume=None): return Transcription.objects.all() -def get_pages(corpus=None, volume=None): +def get_elements(corpus=None, volume=None): if volume: - return Page.objects.get_descending(volume.id) + queryset = Element.objects.get_descending(volume.id) elif corpus: - return Page.objects.filter(corpus=corpus) + queryset = Element.objects.filter(corpus=corpus) + else: + queryset = Element.objects.all() - return Page.objects.filter( - id__in=Transcription.objects.values_list('element', flat=True).distinct(), - ) + return queryset \ + .filter(type__in=(ElementType.Page, ElementType.Act)) \ + .prefetch_related('metadatas', 'transcriptions') def get_entities(corpus=None, volume=None): @@ -60,21 +51,16 @@ def get_entities(corpus=None, volume=None): class Command(PonosCommand): - help = 'Selectively reindex transcriptions, pages and acts into ElasticSearch' + help = 'Selectively reindex transcriptions, elements and entities into ElasticSearch' docker_image = settings.ARKINDEX_APP_IMAGE base_recipe = settings.PONOS_RECIPE # Setup for reindexation of different Elements index_methods = { - 'acts': { - 'bulk_size': 100, - 'model': Act, - 'items': get_acts, - }, - 'pages': { + 'elements': { 'bulk_size': 100, - 'model': Page, - 'items': get_pages, + 'model': Element, + 'items': get_elements, }, 'transcriptions': { 'bulk_size': 400, @@ -90,19 +76,14 @@ class Command(PonosCommand): def add_arguments(self, parser): super().add_arguments(parser) - parser.add_argument( - '--acts', - help='Reindex acts', - action='store_true', - ) parser.add_argument( '-ts', '--transcriptions', help='Reindex transcriptions', action='store_true', ) parser.add_argument( - '--pages', - help='Reindex pages', + '--elements', + help='Reindex elements', action='store_true', ) parser.add_argument( diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index c53d62f1efac10353a1c701d9558cc18fbe421b5..6bbceb8554bd57d91593a9c0a671410327ce4a18 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -8,7 +8,7 @@ from arkindex_common.enums import TranscriptionType, MetaType, EntityType from arkindex_common.ml_tool import MLToolType from arkindex.project.models import IndexableModel from arkindex.project.fields import ArrayField -from arkindex.project.elastic import ESTranscription, ESPage, ESAct, ESEntity +from arkindex.project.elastic import ESTranscription, ESElement, ESEntity from arkindex.documents.managers import ElementManager, CorpusManager, PageManager, ActManager from arkindex.documents.dates import InterpretedDateMixin import uuid @@ -114,6 +114,7 @@ class Element(IndexableModel): blank=True, ) + es_document = ESElement objects = ElementManager() def get_thumbnail(self): @@ -265,7 +266,6 @@ class Page(Element): """ with folio numbering """ - es_document = ESPage objects = PageManager() class Meta: @@ -306,7 +306,6 @@ class Act(Element): """ A logical subdivision that can span on multiple images via zones """ - es_document = ESAct objects = ActManager() class Meta: diff --git a/arkindex/documents/search.py b/arkindex/documents/search.py index f0d3cce5c0c6b52579b2a91457968592cc5df122..116a3bdf09aeab5899323815035fd1e372da972e 100644 --- a/arkindex/documents/search.py +++ b/arkindex/documents/search.py @@ -1,4 +1,4 @@ -from arkindex.documents.models import Transcription, Act, Page, Element, Entity +from arkindex.documents.models import Transcription, Element, Entity from itertools import chain import uuid @@ -29,20 +29,6 @@ def search_transcriptions_filter_post(data, element_id): ) -def search_pages_post(data): - """ - Search pages containing query - """ - return search_nested_post(Page, data) - - -def search_acts_post(data): - """ - Search acts containing query - """ - return search_nested_post(Act, data) - - def search_entities_post(data): """ Search entities containing query @@ -57,14 +43,10 @@ def search_entities_post(data): return list(filter(None, map(entities.get, entity_ids))) -def search_nested_post(model, data): +def search_elements_post(data): """ - Search a specific element type for a query - Returns a list of `model` instances + Search elements for a query """ - assert isinstance(model, type), "Model argument must be a class" - assert issubclass(model, Element), "Model must be an Element" - elt_ids = list(map(uuid.UUID, (result.meta.id for result in data))) if not elt_ids: return [] @@ -96,7 +78,7 @@ def search_nested_post(model, data): for result in data } - elts = list(model.objects.filter(id__in=elt_ids).prefetch_related('corpus')) + elts = list(Element.objects.filter(id__in=elt_ids).prefetch_related('corpus')) # Preserve the ordering given by ElasticSearch ordered_elts = list(filter(None, map(lambda eid: next((e for e in elts if e.id == eid), None), elt_ids))) diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py index 115d474954a1e94ac986ac65f517220afd80900b..91960cecee3c0190f9e7b3cde3b774e264157c50 100644 --- a/arkindex/documents/serializers/search.py +++ b/arkindex/documents/serializers/search.py @@ -20,7 +20,8 @@ class SearchQuerySerializer(serializers.Serializer): score = serializers.FloatField(source='min_score', min_value=0.0, max_value=1.0, default=0.0) date_gte = serializers.CharField(default=None) date_lte = serializers.CharField(source='date_lt', default=None) - type = EnumField(enum=TranscriptionType, default=None) + element_type = EnumField(enum=ElementType, default=None) + transcription_type = EnumField(enum=TranscriptionType, default=None) corpus = serializers.UUIDField(source='corpus_id', default=None) def parse_date(self, raw_date): diff --git a/arkindex/documents/tests/commands/test_reindex.py b/arkindex/documents/tests/commands/test_reindex.py index daa53c3737ad27ef48d61fc40083cb2cd1b9b4a0..97592e95ffd30454d07c6d334993aeb36eeb4698 100644 --- a/arkindex/documents/tests/commands/test_reindex.py +++ b/arkindex/documents/tests/commands/test_reindex.py @@ -2,9 +2,8 @@ from unittest.mock import patch, call from django.core.management import call_command from arkindex_common.enums import MetaType from arkindex.project.tests import FixtureTestCase -from arkindex.project.elastic import ESTranscription, ESAct, ESPage, ESEntity -from arkindex.documents.models import Element, ElementType, Act, Page, Transcription, Entity, EntityType, \ - DataSource, MLToolType +from arkindex.project.elastic import ESTranscription, ESElement, ESEntity +from arkindex.documents.models import Element, ElementType, Transcription, Entity, EntityType, DataSource, MLToolType class TestReindexCommand(FixtureTestCase): @@ -14,9 +13,9 @@ class TestReindexCommand(FixtureTestCase): super().setUpTestData() source = DataSource.objects.create(type=MLToolType.NER, slug='entity', internal=True) cls.indexer_patch = patch('arkindex.documents.management.commands.reindex.Indexer') - cls.vol = Element.objects.get(type=ElementType.Volume, name="Volume 1") + cls.vol = cls.corpus.elements.get(type=ElementType.Volume, name="Volume 1") cls.entity = cls.corpus.entities.create(type=EntityType.Misc, name='Dummy entity', source=source) - page = Page.objects.get(name='Volume 1, page 1r') + page = cls.corpus.elements.get(name='Volume 1, page 1r') page.metadatas.create(name='Dummy metadata', value='Dummy', type=MetaType.Text, entity=cls.entity) def setUp(self): @@ -27,15 +26,15 @@ class TestReindexCommand(FixtureTestCase): super().tearDown() self.indexer_patch.stop() - def _assert_all_acts(self, call_args): + def _assert_all_elements(self, call_args): """ - Helper method to assert run_index is called to reindex all acts + Helper method to assert run_index is called to reindex all elements Required because of self.assertQuerysetEqual, the only way Django has to compare querysets in tests """ (queryset, ), kwargs = call_args self.assertQuerysetEqual( queryset, - map(repr, Act.objects.all()), + map(repr, Element.objects.filter(type__in=(ElementType.Page, ElementType.Act))), ordered=False, ) self.assertDictEqual(kwargs, {'bulk_size': 100}) @@ -52,20 +51,6 @@ class TestReindexCommand(FixtureTestCase): ) self.assertDictEqual(kwargs, {'bulk_size': 400}) - def _assert_all_pages(self, call_args): - """ - Helper method to assert run_index is called to reindex all pages - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Page.objects.filter( - id__in=Transcription.objects.values_list('element', flat=True).distinct(), - )), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) - def _assert_all_transcriptions(self, call_args): """ Helper method to assert run_index is called to reindex all transcriptions @@ -82,21 +67,20 @@ class TestReindexCommand(FixtureTestCase): """ Helper method to assert run_index is called three times to reindex everything """ - self.assertEqual(self.indexer_mock().run_index.call_count, 4) - acts_call, entities_call, pages_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - self._assert_all_acts(acts_call) + self.assertEqual(self.indexer_mock().run_index.call_count, 3) + elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) + self._assert_all_elements(elements_call) self._assert_all_entities(entities_call) self._assert_all_transcriptions(ts_call) - self._assert_all_pages(pages_call) - def _assert_volume_acts(self, call_args): + def _assert_volume_elements(self, call_args): """ - Helper method to assert run_index is called to reindex all acts in a volume + Helper method to assert run_index is called to reindex all elements in a volume """ (queryset, ), kwargs = call_args self.assertQuerysetEqual( queryset, - map(repr, Act.objects.get_descending(self.vol.id)), + map(repr, Element.objects.get_descending(self.vol.id).filter(type__in=(ElementType.Page, ElementType.Act))), ordered=False, ) self.assertDictEqual(kwargs, {'bulk_size': 100}) @@ -115,18 +99,6 @@ class TestReindexCommand(FixtureTestCase): ) self.assertDictEqual(kwargs, {'bulk_size': 400}) - def _assert_volume_pages(self, call_args): - """ - Helper method to assert run_index is called to reindex all pages in a volume - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Page.objects.get_descending(self.vol.id)), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) - def _assert_volume_transcriptions(self, call_args): """ Helper method to assert run_index is called to reindex all transcriptions in a volume @@ -145,21 +117,20 @@ class TestReindexCommand(FixtureTestCase): """ Helper method to assert run_index is called three times to reindex a volume """ - self.assertEqual(self.indexer_mock().run_index.call_count, 4) - acts_call, entities_call, pages_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - self._assert_volume_acts(acts_call) + self.assertEqual(self.indexer_mock().run_index.call_count, 3) + elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) + self._assert_volume_elements(elements_call) self._assert_volume_entities(entities_call) - self._assert_volume_pages(pages_call) self._assert_volume_transcriptions(ts_call) - def _assert_corpus_acts(self, call_args): + def _assert_corpus_elements(self, call_args): """ - Helper method to assert run_index is called to reindex all acts in a corpus + Helper method to assert run_index is called to reindex all elements in a corpus """ (queryset, ), kwargs = call_args self.assertQuerysetEqual( queryset, - map(repr, Act.objects.filter(corpus=self.corpus)), + map(repr, Element.objects.filter(corpus=self.corpus, type__in=(ElementType.Page, ElementType.Act))), ordered=False, ) self.assertDictEqual(kwargs, {'bulk_size': 100}) @@ -176,18 +147,6 @@ class TestReindexCommand(FixtureTestCase): ) self.assertDictEqual(kwargs, {'bulk_size': 400}) - def _assert_corpus_pages(self, call_args): - """ - Helper method to assert run_index is called to reindex all pages in a corpus - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Page.objects.filter(corpus=self.corpus)), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) - def _assert_corpus_transcriptions(self, call_args): """ Helper method to assert run_index is called to reindex all transcriptions in a corpus @@ -204,11 +163,10 @@ class TestReindexCommand(FixtureTestCase): """ Helper method to assert run_index is called three times to reindex a volume """ - self.assertEqual(self.indexer_mock().run_index.call_count, 4) - acts_call, entities_call, pages_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - self._assert_corpus_acts(acts_call) + self.assertEqual(self.indexer_mock().run_index.call_count, 3) + elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) + self._assert_corpus_elements(elements_call) self._assert_corpus_entities(entities_call) - self._assert_corpus_pages(pages_call) self._assert_corpus_transcriptions(ts_call) def test_acts(self): @@ -217,12 +175,12 @@ class TestReindexCommand(FixtureTestCase): """ call_command( 'reindex', - acts=True, + elements=True, ) self.assertEqual(self.indexer_mock().drop_index.call_count, 0) self.assertEqual(self.indexer_mock().setup.call_count, 0) self.assertEqual(self.indexer_mock().run_index.call_count, 1) - self._assert_all_acts(self.indexer_mock().run_index.call_args) + self._assert_all_elements(self.indexer_mock().run_index.call_args) def test_entities(self): """ @@ -237,19 +195,6 @@ class TestReindexCommand(FixtureTestCase): self.assertEqual(self.indexer_mock().run_index.call_count, 1) self._assert_all_entities(self.indexer_mock().run_index.call_args) - def test_pages(self): - """ - Test the reindex command can reindex pages - """ - call_command( - 'reindex', - pages=True, - ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 0) - self.assertEqual(self.indexer_mock().run_index.call_count, 1) - self._assert_all_pages(self.indexer_mock().run_index.call_args) - def test_transcriptions(self): """ Test the reindex command can reindex transcriptions @@ -306,11 +251,10 @@ class TestReindexCommand(FixtureTestCase): 'reindex', drop=True, ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 4) + self.assertEqual(self.indexer_mock().drop_index.call_count, 3) self.assertCountEqual(self.indexer_mock().drop_index.call_args_list, [ call(ESTranscription), - call(ESAct), - call(ESPage), + call(ESElement), call(ESEntity), ]) self.assertEqual(self.indexer_mock().setup.call_count, 1) diff --git a/arkindex/documents/tests/test_act.py b/arkindex/documents/tests/test_act.py index 2536c958f6476f4946c817bc042c8920a4e225a0..9092e6395efd3a7e38b6dd6a90f9655430c6e626 100644 --- a/arkindex/documents/tests/test_act.py +++ b/arkindex/documents/tests/test_act.py @@ -3,7 +3,7 @@ from django.urls import reverse from rest_framework import status from arkindex_common.enums import MetaType from arkindex.project.tests import FixtureAPITestCase -from arkindex.project.elastic import ESAct +from arkindex.project.elastic import ESElement from arkindex.documents.models import ElementType from arkindex.documents.dates import DateType, InterpretedDate @@ -116,12 +116,12 @@ class TestAct(FixtureAPITestCase): # Test Upper bound get_dates_mock.return_value = [InterpretedDate(1420, 5, type=DateType.Upper), ] - date_range = ESAct.from_model(self.act).to_dict().get('date_range') + date_range = ESElement.from_model(self.act).to_dict().get('date_range') self.assertDictEqual(date_range, {'lt': '1420-05||+1M'}) # Test Lower bound get_dates_mock.return_value = [InterpretedDate(1418, type=DateType.Lower), ] - date_range = ESAct.from_model(self.act).to_dict().get('date_range') + date_range = ESElement.from_model(self.act).to_dict().get('date_range') self.assertDictEqual(date_range, {'gte': '1418'}) # Test with both Lower and Upper bound @@ -129,10 +129,10 @@ class TestAct(FixtureAPITestCase): InterpretedDate(1418, type=DateType.Lower), InterpretedDate(1428, type=DateType.Upper), ] - date_range = ESAct.from_model(self.act).to_dict().get('date_range') + date_range = ESElement.from_model(self.act).to_dict().get('date_range') self.assertDictEqual(date_range, {'gte': '1418', 'lt': '1428||+1y'}) # Test an exact date get_dates_mock.return_value = [InterpretedDate(1666, 2, 3), ] - date_range = ESAct.from_model(self.act).to_dict().get('date_range') + date_range = ESElement.from_model(self.act).to_dict().get('date_range') self.assertDictEqual(date_range, {'gte': '1666-02-03', 'lt': '1666-02-03||+1d'}) diff --git a/arkindex/documents/tests/test_search.py b/arkindex/documents/tests/test_search.py index 98fff455d3298bb7c141162295e859da1b307012..98c2a36110cfcf80183b40ee5c08e903483ace4c 100644 --- a/arkindex/documents/tests/test_search.py +++ b/arkindex/documents/tests/test_search.py @@ -6,7 +6,7 @@ from elasticsearch_dsl.connections import connections from arkindex_common.enums import EntityType from arkindex.project.tests import FixtureAPITestCase from arkindex.project.elastic import ESTranscription -from arkindex.documents.models import Transcription, Act, Page, Element, Corpus, DataSource, MLToolType +from arkindex.documents.models import Transcription, Element, Corpus, DataSource, MLToolType import uuid @@ -70,14 +70,14 @@ class TestSearchAPI(FixtureAPITestCase): } } - def make_nested_hit(self, index, doctype, elt, ts, score=1.0): + def make_element_hit(self, elt, ts, score=1.0): return { "_score": score, - "_type": doctype, + "_type": 'element', "_id": str(elt.id.hex), # TODO test date ranges in a query '_source': {'date_range': []}, - "_index": index, + "_index": 'elements', "inner_hits": { "transcriptions": { "hits": { @@ -89,12 +89,6 @@ class TestSearchAPI(FixtureAPITestCase): } } - def make_act_hit(self, act, ts, score=1.0): - return self.make_nested_hit("acts", "act", act, ts, score) - - def make_page_hit(self, page, ts, score=1.0): - return self.make_nested_hit("pages", "page", page, ts, score) - def make_entity_hit(self, entity): return { "_id": str(entity.id), @@ -102,23 +96,23 @@ class TestSearchAPI(FixtureAPITestCase): "_type": "entity", } - def test_page_search(self): - page = Page.objects.get(name="Volume 1, page 1r") + def test_element_search(self): + elt = Element.objects.get(name="Volume 1, page 1r") ts = Transcription.objects.filter(text="PARIS", zone__image__path='img1') self.es_mock.count.return_value = {'count': 1} self.es_mock.search.return_value = self.build_es_response( - [self.make_page_hit(page, ts), ], + [self.make_element_hit(elt, ts), ], ) - response = self.client.get(reverse('api:page-search'), {'q': "paris"}) + response = self.client.get(reverse('api:element-search'), {'q': "paris"}) self.assertEqual(response.status_code, status.HTTP_200_OK) results = response.json()["results"] self.assertEqual(len(results), 1) result = results[0] - self.assertEqual(result['id'], str(page.id)) + self.assertEqual(result['id'], str(elt.id)) self.assertCountEqual( [t['id'] for t in result['transcriptions']], map(str, ts.values_list('id', flat=True)), @@ -128,8 +122,8 @@ class TestSearchAPI(FixtureAPITestCase): args, kwargs = self.es_mock.search.call_args self.assertTupleEqual(args, ()) self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type']) - self.assertListEqual(kwargs['index'], ['pages']) - self.assertListEqual(kwargs['doc_type'], ['page']) + self.assertListEqual(kwargs['index'], ['elements']) + self.assertListEqual(kwargs['doc_type'], ['element']) self.assertCountEqual(kwargs['body'].keys(), ['_source', 'from', 'size', 'query', 'sort']) self.assertListEqual(kwargs['body']['_source'], ['date_range']) @@ -170,74 +164,6 @@ class TestSearchAPI(FixtureAPITestCase): }}, ]) - def test_act_search(self): - act = Act.objects.get(name='Act 1') - ts = Transcription.objects.filter(text__in=["PARIS", "ROY"], zone__image__path='img1') - - self.es_mock.count.return_value = {'count': 1} - self.es_mock.search.return_value = self.build_es_response( - [self.make_act_hit(act, ts), ], - ) - - response = self.client.get(reverse('api:act-search'), {'q': "paris roy"}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - - results = response.json()["results"] - self.assertEqual(len(results), 1) - result = results[0] - - self.assertEqual(result['id'], str(act.id)) - self.assertCountEqual( - [t['id'] for t in result['transcriptions']], - map(str, ts.values_list('id', flat=True)), - ) - self.assertEqual(result['total_transcriptions'], len(ts)) - - args, kwargs = self.es_mock.search.call_args - self.assertTupleEqual(args, ()) - self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type']) - self.assertListEqual(kwargs['index'], ['acts']) - self.assertListEqual(kwargs['doc_type'], ['act']) - - self.assertCountEqual(kwargs['body'].keys(), ['_source', 'from', 'size', 'query', 'sort']) - self.assertListEqual(kwargs['body']['_source'], ['date_range']) - self.assertEqual(kwargs['body']['from'], 0) - self.assertEqual(kwargs['body']['size'], 1) - - self.assertCountEqual( - kwargs['body']['query']['bool']['filter'][0]['terms']['corpus'], - map(str, Corpus.objects.readable(AnonymousUser()).values_list('id', flat=True)), - ) - - nested = kwargs['body']['query']['bool']['must'][0]['nested'] - self.assertEqual(nested['score_mode'], 'sum') - self.assertEqual(nested['path'], 'transcriptions') - self.assertIn('inner_hits', nested) - - function_score = nested['query']['function_score'] - self.assertListEqual(function_score['functions'], [ - { - "field_value_factor": { - "field": "transcriptions.score", - } - } - ]) - self.assertIsInstance(function_score['query']['bool']['must'], list) - - self.assertTrue(all(len(cond.keys()) == 1 for cond in function_score['query']['bool']['must'])) - conditions = function_score['query']['bool']['must'] - self.assertCountEqual(conditions, [ - {'simple_query_string': { - 'query': 'paris roy', - 'fields': ['transcriptions.text'], - }}, - {'range': { - 'transcriptions.score': { - 'gte': 0.0, - } - }}, - ]) - def test_iiif_transcription_search(self): # Filter to only get transcriptions from volume 1 unfiltered = Transcription.objects.filter(text="PARIS") diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py index d066c80be195447b1186063d07ebbf157781bb71..9c33182995391373b00e9414816c021a87fdd583 100644 --- a/arkindex/documents/tests/test_search_api.py +++ b/arkindex/documents/tests/test_search_api.py @@ -19,7 +19,8 @@ class TestSearchApi(FixtureAPITestCase): {'q': 'one two', 'date_lte': '1333'}, {'q': 'one two', 'date_gte': '1333-12'}, {'q': 'one two', 'date_gte': '1333-12-02'}, - {'q': 'one', 'type': 'page'}, + {'q': 'one', 'transcription_type': 'page'}, + {'q': 'one', 'element_type': 'page'}, {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'}, # Search by date only {'date_gte': '1280', 'date_lte': '1290'}, @@ -33,7 +34,7 @@ class TestSearchApi(FixtureAPITestCase): {'q': 'that', 'score': 'inf'}, {'q': 'one two', 'date_lte': '1450-'}, {'q': 'one two', 'date_lte': '1450-02-30'}, - {'q': 'one', 'type': 'wrongtype'}, + {'q': 'one', 'transcription_type': 'wrongtype'}, {'q': 'cat', 'corpus': 'not_even_an_uuid'}, {'q': 'two', 'date_gte': '1460', 'date_lte': '1450'}, ) @@ -48,13 +49,13 @@ class TestSearchApi(FixtureAPITestCase): correctly handled by search api endpoint """ for params in self.valid_params: - response = self.client.get(reverse('api:act-search'), params) + response = self.client.get(reverse('api:element-search'), params) self.assertEqual(response.status_code, status.HTTP_200_OK, response.json()) for params in self.wrong_params: - response = self.client.get(reverse('api:act-search'), params) + response = self.client.get(reverse('api:element-search'), params) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) for params in self.forbidden_params: - response = self.client.get(reverse('api:act-search'), params) + response = self.client.get(reverse('api:element-search'), params) self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index 1a0b9ca4ba3523330d2f5c7a99f4c5daefc1275f..5849a2cf3425c317b3e6e1e139d8dd6d8ff361a5 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -7,7 +7,7 @@ from arkindex.documents.api.elements import ( ElementTranscriptions, ElementsCreate, ElementRegions, RegionDetails, RegionCreate, ElementNeighbors, ) -from arkindex.documents.api.search import PageSearch, ActSearch, EntitySearch +from arkindex.documents.api.search import ElementSearch, EntitySearch from arkindex.documents.api.ml import ( ClassificationCreate, ClassificationValidate, ClassificationReject, ClassificationBulk, TranscriptionCreate, TranscriptionBulk, PageXmlTranscriptionsImport, @@ -101,8 +101,7 @@ api = [ ), # Search engines - path('pages/', PageSearch.as_view(), name='page-search'), - path('acts/', ActSearch.as_view(), name='act-search'), + path('elements/search/', ElementSearch.as_view(), name='element-search'), path('entity/search/', EntitySearch.as_view(), name='entity-search'), # Edit acts diff --git a/arkindex/project/elastic.py b/arkindex/project/elastic.py index a3721c838b0d78a15aafe06fdcf5603746b0125f..4b538261a5aa29180a6240244681e565a05693ed 100644 --- a/arkindex/project/elastic.py +++ b/arkindex/project/elastic.py @@ -90,59 +90,34 @@ class ESTranscriptionInnerDoc(InnerDoc): class ESElement(Document): + type = Keyword() corpus = Keyword() - volumes = Keyword() + # Used exclusively for sorting + parents = Keyword() transcriptions = Nested(ESTranscriptionInnerDoc) - - @classmethod - def from_model(cls, instance): - from arkindex.documents.models import Element, ElementType - return cls( - meta={'id': instance.id.hex}, - corpus=instance.corpus_id, - volumes=[ - v.name - for v in Element.objects.get_ascending( - instance.id, - type=ElementType.Volume, - ) - ], - transcriptions=list(map( - ESTranscriptionInnerDoc.from_model, - instance.transcriptions.all(), - )), - ) - - -class ESPage(ESElement): - class Meta: - mapping = Mapping('page') - - class Index: - name = 'pages' - - -class ESAct(ESElement): date_range = RawDateRange(format='yyyy||yyyy-MM||yyyy-MM-dd') class Meta: - mapping = Mapping('act') + mapping = Mapping('element') class Index: - name = 'acts' + name = 'elements' @classmethod def from_model(cls, instance): from arkindex.documents.models import Element, ElementType, Transcription - surfaces = Element.objects.get_descending(instance.id).filter(type=ElementType.Surface) - transcriptions = [ - Transcription.objects.filter( - zone__image=s.zone.image, - zone__polygon__in=s.zone.polygon - ) - for s in surfaces - ] + if instance.type == ElementType.Act: + surfaces = Element.objects.get_descending(instance.id).filter(type=ElementType.Surface) + transcriptions = chain(*[ + Transcription.objects.filter( + zone__image=s.zone.image, + zone__polygon__in=s.zone.polygon + ) + for s in surfaces + ]) + else: + transcriptions = instance.transcriptions.all() interpreted_dates = chain(*[md.get_dates() for md in instance.metadatas.all()]) date_range = { @@ -157,16 +132,14 @@ class ESAct(ESElement): return cls( meta={'id': instance.id.hex}, corpus=instance.corpus_id, - volumes=[ - v.name - for v in Element.objects.get_ascending( - instance.id, - type=ElementType.Volume, - ) + type=instance.type.value, + parents=[ + element.name + for element in Element.objects.get_ascending(instance.id) ], transcriptions=list(map( ESTranscriptionInnerDoc.from_model, - chain(*transcriptions), + transcriptions, )), date_range=date_range, ) diff --git a/openapi/patch.yml b/openapi/patch.yml index 42b844446374df1e25768d60fecf1e658ea32659..af51cb86853821b09843ec577552b7984a64e267 100644 --- a/openapi/patch.yml +++ b/openapi/patch.yml @@ -45,17 +45,6 @@ paths: security: [] tags: - elements - /api/v1/acts/: - get: - operationId: SearchActs - description: >- - Get a list of acts with their parent registers or volumes, the total - number of transcriptions found in the act, and a few (not all) of the - transcriptions found inside of each act, with their source, type, - zone and image, for a given search query. - security: [] - tags: - - search /api/v1/classification/bulk/: post: description: >- @@ -847,17 +836,6 @@ paths: schema: {} tags: - ml - /api/v1/pages/: - get: - operationId: SearchPages - description: >- - Get a list of pages with their parent registers or volumes, the total - number of transcriptions found in the page, and a few (not all) of the - transcriptions found inside of each page, with their source, type, - zone and image, for a given search query. - security: [] - tags: - - search /api/v1/region/{id}/: get: description: Retrieve detailed information about a region