diff --git a/VERSION b/VERSION
index ee94dd834b5395f973d3c7992f661d306320aec2..d9255e65396cad391678600614c5f4e5c2e51b1f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.8.3
+0.8.4~dev
diff --git a/arkindex/documents/api/iiif.py b/arkindex/documents/api/iiif.py
index dbe108a444fbb1a790f6a7c43431b0837aaf1e56..e8359d81003df51a6160f386d710d5b42f910add 100644
--- a/arkindex/documents/api/iiif.py
+++ b/arkindex/documents/api/iiif.py
@@ -8,7 +8,7 @@ from arkindex.documents.serializers.iiif import \
     PageAnnotationListSerializer, PageActAnnotationListSerializer, \
     SurfaceAnnotationListSerializer, TranscriptionSearchAnnotationListSerializer
 from arkindex.documents.search import search_transcriptions_filter_post
-from arkindex.project.elastic import ESQuerySet
+from arkindex.project.mixins import SearchAPIMixin
 
 
 class VolumeManifest(RetrieveAPIView):
@@ -92,30 +92,36 @@ class SurfaceAnnotationList(RetrieveAPIView):
         return super().get(*args, **kwargs)
 
 
-class TranscriptionSearchAnnotationList(RetrieveAPIView):
+class TranscriptionSearchAnnotationList(SearchAPIMixin, RetrieveAPIView):
     """
    Search for transcriptions inside an element and get a IIIF annotation list
     """
     serializer_class = TranscriptionSearchAnnotationListSerializer
+    template_path = 'elastic/search_transcriptions.json'
+    es_index = settings.ES_INDEX_TRANSCRIPTIONS
+    es_type = Transcription.INDEX_TYPE
+    elt = None
+
+    def get_element(self):
+        if not self.elt:
+            self.elt = Element.objects.get(
+                corpus__in=Corpus.objects.readable(self.request.user),
+                id=self.kwargs['pk'],
+            )
+        return self.elt
+
+    def get_context(self):
+        ctx = super().get_context()
+        ctx['type'] = 'word'
+        ctx['corpus_id'] = self.get_element().corpus_id
+        return ctx
+
+    def get_post_process_args(self):
+        return [self.get_element().id]
 
     def get_object(self):
-        elt = Element.objects.get(
-            corpus__in=Corpus.objects.readable(self.request.user),
-            id=self.kwargs['pk'],
-        )
-        return ESQuerySet(
-            query=ESQuerySet.make_query(
-                'elastic/search_transcriptions.json',
-                ctx={
-                    'query': self.request.query_params.get('q'),
-                    'type': 'word',
-                    'corpus_id': elt.corpus_id,
-                    'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT,
-                },
-            ),
-            es_index=settings.ES_INDEX_TRANSCRIPTIONS,
-            es_type=Transcription.INDEX_TYPE,
-            post_process=search_transcriptions_filter_post,
-            post_process_args=[elt.id]
-        )
+        return self.get_queryset()
+
+    def post_process(self, data, *args):
+        return search_transcriptions_filter_post(data, *args)
 
diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py
index a57034eb8b1d76a9382def34dc2daa76862a16d6..c7ed2d7b670803893e7b8059aa2353685438c3a3 100644
--- a/arkindex/documents/api/search.py
+++ b/arkindex/documents/api/search.py
@@ -1,8 +1,8 @@
 from django.conf import settings
 from rest_framework.generics import ListAPIView
-from arkindex.documents.models import Transcription, Act
-from arkindex.documents.serializers.search import TranscriptionSearchResultSerializer, ActSearchResultSerializer
-from arkindex.documents.search import search_transcriptions_post, search_acts_post
+from arkindex.documents.models import Page, Act
+from arkindex.documents.serializers.search import PageSearchResultSerializer, ActSearchResultSerializer
+from arkindex.documents.search import search_pages_post, search_acts_post
 from arkindex.project.mixins import SearchAPIMixin
 
 
@@ -10,26 +10,21 @@ class SearchAPIView(SearchAPIMixin, ListAPIView):
     """
     A base class for ES search views
     """
-    template_path = None
-    es_source = True
-    es_query = None
-    es_index = None
-    es_type = None
-    es_sort = None
 
 
-class TranscriptionSearch(SearchAPIView):
+class PageSearch(SearchAPIView):
     """
-    Search and list transcriptions, using pagination
+    Search and list transcriptions inside pages
     """
-    serializer_class = TranscriptionSearchResultSerializer
-    template_path = 'elastic/search_transcriptions.json'
-    es_sort = {"score": {"order": "desc", "mode": "max"}}
-    es_index = settings.ES_INDEX_TRANSCRIPTIONS
-    es_type = Transcription.INDEX_TYPE
+    serializer_class = PageSearchResultSerializer
+    template_path = 'elastic/search_nested.json'
+    es_source = False
+    es_sort = ["_score", ]
+    es_index = settings.ES_INDEX_PAGES
+    es_type = Page.INDEX_TYPE
 
     def post_process(self, *args, **kwargs):
-        return search_transcriptions_post(*args)
+        return search_pages_post(*args)
 
 
 class ActSearch(SearchAPIView):
@@ -37,8 +32,9 @@
     Search for acts containing a specific word
     """
     serializer_class = ActSearchResultSerializer
-    template_path = 'elastic/search_acts.json'
+    template_path = 'elastic/search_nested.json'
     es_source = False
+    es_sort = ['_score', ]
     es_index = settings.ES_INDEX_ACTS
     es_type = Act.INDEX_TYPE
 
diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py
index 37589f08637e852af2163e7732c850adaaa7d2f7..782c0b77f3bc26ad887cf5cc77f6d71543131093 100644
--- a/arkindex/documents/indexer.py
+++ b/arkindex/documents/indexer.py
@@ -1,8 +1,8 @@
 from django.conf import settings
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk as es_bulk
-from elasticsearch.exceptions import NotFoundError
-from arkindex.documents.models import Act
+from elasticsearch.exceptions import NotFoundError, RequestError
+from arkindex.documents.models import Act, Page
 import logging
 import time
 import datetime
@@ -19,27 +19,54 @@ class Indexer(object):
         """
         Create indexes in ElasticSearch
         """
-        self.elastic.indices.create(
-            index=settings.ES_INDEX_ACTS,
-            body={
-                "mappings": {
-                    Act.INDEX_TYPE: {
-                        "properties": {
-                            "transcriptions": {
-                                "type": "nested",
-                                "properties": {
-                                    "id": {"type": "keyword"},
-                                    "type": {"type": "keyword"},
-                                    "score": {"type": "float"},
-                                    "text": {"type": "text"},
-                                    "corpus": {"type": "keyword"}
+        try:
+            self.elastic.indices.create(
+                index=settings.ES_INDEX_ACTS,
+                body={
+                    "mappings": {
+                        Act.INDEX_TYPE: {
+                            "properties": {
+                                "corpus": {"type": "keyword"},
+                                "transcriptions": {
+                                    "type": "nested",
+                                    "properties": {
+                                        "id": {"type": "keyword"},
+                                        "type": {"type": "keyword"},
+                                        "score": {"type": "float"},
+                                        "text": {"type": "text"},
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            )
+        except RequestError:  # Index already exists
+            pass
+        try:
+            self.elastic.indices.create(
+                index=settings.ES_INDEX_PAGES,
+                body={
+                    "mappings": {
+                        Page.INDEX_TYPE: {
+                            "properties": {
+                                "corpus": {"type": "keyword"},
+                                "transcriptions": {
+                                    "type": "nested",
+                                    "properties": {
+                                        "id": {"type": "keyword"},
+                                        "type": {"type": "keyword"},
+                                        "score": {"type": "float"},
+                                        "text": {"type": "text"},
+                                    }
                                 }
                             }
                         }
                     }
                 }
-            }
-        )
+            )
+        except RequestError:  # Index already exists
+            pass
 
     def drop_index(self, index_name):
         """
diff --git a/arkindex/documents/management/commands/reindex.py b/arkindex/documents/management/commands/reindex.py
index bd716da94650c4ec14919a312525b4b90c315443..04cc073969372dd721f5ab95eaa35e2b30d9251d 100644
--- a/arkindex/documents/management/commands/reindex.py
+++ b/arkindex/documents/management/commands/reindex.py
@@ -3,7 +3,7 @@ from django.core.management.base import BaseCommand
 from django.conf import settings
 from arkindex.documents.models import Element, ElementType
 from arkindex.documents.indexer import Indexer
-from arkindex.documents.tasks import reindex_acts, reindex_transcriptions
+from arkindex.documents.tasks import reindex_acts, reindex_transcriptions, reindex_pages
 from celery import chain
 import logging
 
@@ -15,27 +15,32 @@ logger = logging.getLogger(__name__)
 
 
 class Command(BaseCommand):
-    help = 'Selectively reindex transcriptions and acts into ElasticSearch'
+    help = 'Selectively reindex transcriptions, pages and acts into ElasticSearch'
 
     def add_arguments(self, parser):
         parser.add_argument(
             '--acts',
             help='Reindex acts',
-            action='store_true'
+            action='store_true',
         )
         parser.add_argument(
             '-ts', '--transcriptions',
             help='Reindex transcriptions',
-            action='store_true'
+            action='store_true',
+        )
+        parser.add_argument(
+            '--pages',
+            help='Reindex pages',
+            action='store_true',
         )
         parser.add_argument(
             '--volume',
-            help='Restrict reindexing to a specific volume'
+            help='Restrict reindexing to a specific volume',
        )
         parser.add_argument(
             '--drop',
             help="Drop the existing indexes before reindexing",
-            action='store_true'
+            action='store_true',
         )
 
     def handle(self, *args, **options):
@@ -52,7 +57,7 @@
         if options['volume'] is not None:
             volume = Element.objects.get(type=ElementType.Volume, name__icontains=options['volume'])
 
-        if options['transcriptions'] or not options['acts']:
+        if options['transcriptions']:
             if options['drop']:
                 indexer.drop_index(settings.ES_INDEX_TRANSCRIPTIONS)
             if volume:
@@ -60,7 +65,7 @@
             else:
                 tasks.append(reindex_transcriptions.si())
 
-        if options['acts'] or not options['transcriptions']:
+        if options['acts']:
             if options['drop']:
                 indexer.drop_index(settings.ES_INDEX_ACTS)
                 indexer.setup()
@@ -69,5 +74,14 @@
             else:
                 tasks.append(reindex_acts.si())
 
+        if options['pages']:
+            if options['drop']:
+                indexer.drop_index(settings.ES_INDEX_PAGES)
+                indexer.setup()
+            if volume:
+                tasks.append(reindex_pages.si(volume_id=volume.id))
+            else:
+                tasks.append(reindex_pages.si())
+
         task = chain(*tasks).delay(async=False)
         print("Task started with ID {}".format(task.id))
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 23522e15db32592bbbc786231bfc4b4d44daed84..c7ade222a7cd12734064f8ec6c9c351b4f044f07 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -282,6 +282,8 @@ class Page(Element):
     """
     with folio numbering
     """
+    INDEX_TYPE = 'page'
+
     folio = models.CharField(max_length=250)
 
     # Machine learning classes
@@ -371,6 +373,23 @@
             out.append(self.direction.value)
         return ' '.join(out)
 
+    def build_search_index(self):
+        """
+        Structure indexed into ElasticSearch
+        """
+        return {
+            'corpus': self.corpus_id,
+            'transcriptions': [
+                {
+                    'id': t.id,
+                    'type': t.type.value,
+                    'score': t.score,
+                    'text': t.text,
+                }
+                for t in self.transcriptions.all()
+            ]
+        }
+
     def classify(self):
         '''
         Use a machine learning worker to classify the page
@@ -425,13 +444,13 @@
             for s in surfaces
         ]
         return {
+            'corpus': self.corpus_id,
             'transcriptions': [
                 {
                     'id': t.id,
                     'type': t.type.value,
                     'score': t.score,
                     'text': t.text,
-                    'corpus': self.corpus_id,
                 }
                 for sublist in transcriptions for t in sublist
             ]
diff --git a/arkindex/documents/search.py b/arkindex/documents/search.py
index e02646877c5c1f6a8724715a64066a82a9b71c92..2555339613d3f5851183e706b707d7834cca329d 100644
--- a/arkindex/documents/search.py
+++ b/arkindex/documents/search.py
@@ -1,4 +1,4 @@
-from arkindex.documents.models import Transcription, Act, Element
+from arkindex.documents.models import Transcription, Act, Page, Element
 from itertools import chain
 import uuid
 
@@ -22,16 +22,41 @@ def search_transcriptions_post(data):
     return ts
 
 
+def search_transcriptions_filter_post(data, element_id):
+    if not isinstance(element_id, uuid.UUID):
+        element_id = uuid.UUID(element_id)
+    return filter(
+        lambda t: element_id in [p.id for p in chain(*t.parent_paths)],
+        search_transcriptions_post(data)
+    )
+
+
+def search_pages_post(data):
+    """
+    Search pages containing query
+    """
+    return search_nested_post(Page, data)
+
+
 def search_acts_post(data):
     """
     Search acts containing query
-    Returns (acts list, total number of acts)
     """
+    return search_nested_post(Act, data)
+
+
+def search_nested_post(model, data):
+    """
+    Search a specific element type for a query
+    Returns a list of `model` instances
+    """
+    assert isinstance(model, type), "Model argument must be a class"
+    assert issubclass(model, Element), "Model must be an Element"
     results = data['hits']['hits']
-    act_ids = [uuid.UUID(r['_id']) for r in results]
-    act_scores = {uuid.UUID(r['_id']): r['_score'] for r in results}
-    if not act_ids:
+    elt_ids = [uuid.UUID(r['_id']) for r in results]
+    elt_scores = {uuid.UUID(r['_id']): r['_score'] for r in results}
+    if not elt_ids:
         return
     tr_ids = [
         hit['_source']['id']
@@ -48,28 +73,19 @@
         for t in Transcription.objects.filter(id__in=tr_ids).prefetch_related('zone__image__server')
     }
 
-    acts = Act.objects.filter(id__in=act_ids).prefetch_related('corpus')
-    acts_tr_ids = {
+    elts = model.objects.filter(id__in=elt_ids).prefetch_related('corpus')
+    elts_tr_ids = {
         uuid.UUID(result['_id']): [
             uuid.UUID(hit['_source']['id'])
             for hit in result['inner_hits']['transcriptions']['hits']['hits']
         ]
         for result in results
     }
-    all_paths = Element.objects.get_ascendings_paths(*act_ids)
+    all_paths = Element.objects.get_ascendings_paths(*elt_ids)
 
-    for act in acts:
-        act.transcriptions_results = [transcriptions[tid] for tid in acts_tr_ids[act.id]]
-        act.total_transcriptions = tr_totals[act.id]
-        act.parent_paths = all_paths.get(act.id, [])
+    for elt in elts:
+        elt.transcriptions_results = [transcriptions[tid] for tid in elts_tr_ids[elt.id]]
+        elt.total_transcriptions = tr_totals[elt.id]
+        elt.parent_paths = all_paths.get(elt.id, [])
 
-    return sorted(acts, key=lambda a: act_scores[a.id], reverse=True)
-
-
-def search_transcriptions_filter_post(data, element_id):
-    if not isinstance(element_id, uuid.UUID):
-        element_id = uuid.UUID(element_id)
-    return filter(
-        lambda t: element_id in [p.id for p in chain(*t.parent_paths)],
-        search_transcriptions_post(data)
-    )
+    return sorted(elts, key=lambda e: elt_scores[e.id], reverse=True)
diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py
index 7b7209b05d0918f2b0f4d01f3ab8ca41b1646ec0..a04029edfcdb3f8ea367d051bd68da1cedd201c0 100644
--- a/arkindex/documents/serializers/search.py
+++ b/arkindex/documents/serializers/search.py
@@ -1,34 +1,34 @@
 from rest_framework import serializers
-from arkindex.documents.models import Transcription, TranscriptionType, Act
+from arkindex.documents.models import Act, Page
 from arkindex.documents.serializers.light import CorpusLightSerializer
 from arkindex.documents.serializers.elements import ElementLightSerializer
 from arkindex.documents.serializers.transcriptions import TranscriptionSerializer
-from arkindex.images.serializers import ZoneSerializer
-from arkindex.project.serializer_fields import EnumField, ViewerURLField
+from arkindex.project.serializer_fields import ViewerURLField
 
 
-class TranscriptionSearchResultSerializer(serializers.ModelSerializer):
+class PageSearchResultSerializer(serializers.ModelSerializer):
     """
-    Link between objects & their search indexation
+    A page search result with nested transcriptions
     """
-    type = EnumField(TranscriptionType)
-    zone = ZoneSerializer()
-    parents = serializers.ListField(
+    name = serializers.CharField(source='display_name')
+    transcriptions = TranscriptionSerializer(many=True, source='transcriptions_results')
+    total_transcriptions = serializers.IntegerField()
+    parent_paths = serializers.ListField(
         child=serializers.ListField(
             child=ElementLightSerializer()
         ),
-        source='parent_paths',
     )
+    corpus = CorpusLightSerializer()
 
     class Meta:
-        model = Transcription
+        model = Page
         fields = (
             'id',
-            'type',
-            'text',
-            'score',
-            'zone',
-            'parents',
+            'name',
+            'transcriptions',
+            'total_transcriptions',
+            'parent_paths',
+            'corpus',
         )
diff --git a/arkindex/documents/tasks.py b/arkindex/documents/tasks.py
index 9dd5929b3f4f5ebf43314a0047fce011730cba11..b7634fac7cc4bde73c1088f61245352cb3a95483 100644
--- a/arkindex/documents/tasks.py
+++ b/arkindex/documents/tasks.py
@@ -2,7 +2,7 @@ from celery import shared_task, group, chain
 from celery_once import QueueOnce
 from celery.utils.log import get_task_logger
 from django.conf import settings
-from arkindex.documents.models import Element, Act, Transcription, ElementType, Corpus
+from arkindex.documents.models import Element, Act, Transcription, Page, ElementType, Corpus
 from arkindex.documents.indexer import Indexer
 from arkindex.documents.importer import URLManifestsImporter, LocalManifestsImporter
 from arkindex.documents.surface import SurfaceImporter
@@ -43,6 +43,19 @@ def reindex_transcriptions(bulk_size=400, volume_id=None):
     indexer.run_index(settings.ES_INDEX_TRANSCRIPTIONS, Transcription.INDEX_TYPE, transcriptions, bulk_size=bulk_size)
 
 
+@shared_task(base=QueueOnce, once={'graceful': True})
+def reindex_pages(bulk_size=100, volume_id=None):
+    '''
+    Reindex all pages
+    '''
+    indexer = Indexer()
+    if volume_id:
+        pages = Page.objects.get_descending(volume_id)
+    else:
+        pages = Page.objects.filter(id__in=Transcription.objects.values_list('element', flat=True).distinct())
+    indexer.run_index(settings.ES_INDEX_PAGES, Page.INDEX_TYPE, pages, bulk_size=bulk_size)
+
+
 @shared_task
 def import_manifest(path, server_ids=[], corpus_id=None, offline=False, annotations=False, volume_name=None):
     """
diff --git a/arkindex/documents/tests/test_search.py b/arkindex/documents/tests/test_search.py
index 77c7944f0d180c9542e15c95c3c0e6cc61f8b313..336a8628402f032d3f748d2568497ec9d848ba53 100644
--- a/arkindex/documents/tests/test_search.py
+++ b/arkindex/documents/tests/test_search.py
@@ -1,5 +1,5 @@
 from arkindex.project.tests import FixtureAPITestCase
-from arkindex.documents.models import Transcription, Act, Element, Corpus
+from arkindex.documents.models import Transcription, Act, Page, Element, Corpus
 from django.urls import reverse
 from django.contrib.auth.models import AnonymousUser
 from rest_framework import status
@@ -59,12 +59,12 @@
             }
         }
 
-    def make_act_hit(self, act, ts, score=1.0):
+    def make_nested_hit(self, index, doctype, elt, ts, score=1.0):
         return {
             "_score": score,
-            "_type": Act.INDEX_TYPE,
-            "_id": str(act.id.hex),
-            "_index": "acts",
+            "_type": doctype,
+            "_id": str(elt.id.hex),
+            "_index": index,
             "inner_hits": {
                 "transcriptions": {
                     "hits": {
@@ -76,50 +76,75 @@
             }
         }
 
-    def test_transcription_search(self):
-        expected = Transcription.objects.filter(text="PARIS")
+    def make_act_hit(self, act, ts, score=1.0):
+        return self.make_nested_hit("acts", Act.INDEX_TYPE, act, ts, score)
+
+    def make_page_hit(self, page, ts, score=1.0):
+        return self.make_nested_hit("pages", Page.INDEX_TYPE, page, ts, score)
 
-        self.es_mock().count.return_value = {'count': len(expected)}
+    def test_page_search(self):
+        page = Page.objects.get(name="Volume 1, page 1r")
+        ts = Transcription.objects.filter(text="PARIS", zone__image__path='img1')
+
+        self.es_mock().count.return_value = {'count': 1}
         self.es_mock().search.return_value = self.build_es_response(
-            list(map(self.make_transcription_hit, expected)),
+            [self.make_page_hit(page, ts), ],
         )
 
-        response = self.client.get(reverse('api:transcription-search'), {'q': "paris"})
+        response = self.client.get(reverse('api:page-search'), {'q': "paris"})
         self.assertEqual(response.status_code, status.HTTP_200_OK)
         results = response.json()["results"]
+        self.assertEqual(len(results), 1)
+        result = results[0]
+        self.assertEqual(result['id'], str(page.id))
         self.assertCountEqual(
-            [r['id'] for r in results],
-            map(str, expected.values_list('id', flat=True)),
+            [t['id'] for t in result['transcriptions']],
+            map(str, ts.values_list('id', flat=True)),
         )
+        self.assertEqual(result['total_transcriptions'], len(ts))
 
         args, kwargs = self.es_mock().search.call_args
         self.assertTupleEqual(args, ())
         self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type'])
-        self.assertEqual(kwargs['index'], 'transcriptions')
-        self.assertEqual(kwargs['doc_type'], Transcription.INDEX_TYPE)
+        self.assertEqual(kwargs['index'], 'pages')
+        self.assertEqual(kwargs['doc_type'], Page.INDEX_TYPE)
 
         self.assertCountEqual(kwargs['body'].keys(), ['_source', 'from', 'size', 'query', 'sort', 'aggs'])
-        self.assertEqual(kwargs['body']['_source'], True)
+        self.assertEqual(kwargs['body']['_source'], False)
         self.assertEqual(kwargs['body']['from'], 0)
-        self.assertEqual(kwargs['body']['size'], len(expected))
-
-        self.assertIsInstance(kwargs['body']['query']['bool']['must'], list)
-        self.assertTrue(all(len(cond.keys()) == 1 for cond in kwargs['body']['query']['bool']['must']))
-        conditions = {
-            list(cond.keys())[0]: list(cond.values())[0]
-            for cond in kwargs['body']['query']['bool']['must']
-        }
+        self.assertEqual(kwargs['body']['size'], 1)
 
-        self.assertCountEqual(conditions.keys(), ['match', 'terms', 'range'])
-        self.assertEqual(conditions['match']['text'], 'paris')
-        self.assertEqual(conditions['range']['score']['gte'], 0.5)
         self.assertCountEqual(
-            conditions['terms']['corpus.keyword'],
+            kwargs['body']['query']['bool']['filter'][0]['terms']['corpus'],
             map(str, Corpus.objects.readable(AnonymousUser()).values_list('id', flat=True)),
         )
 
+        nested = kwargs['body']['query']['bool']['must'][0]['nested']
+        self.assertEqual(nested['score_mode'], 'sum')
+        self.assertEqual(nested['path'], 'transcriptions')
+        self.assertIn('inner_hits', nested)
+
+        function_score = nested['query']['function_score']
+        self.assertListEqual(function_score['functions'], [
+            {
+                "field_value_factor": {
+                    "field": "transcriptions.score",
+                }
+            }
+        ])
+        self.assertIsInstance(function_score['query']['bool']['must'], list)
+
+        self.assertTrue(all(len(cond.keys()) == 1 for cond in function_score['query']['bool']['must']))
+        conditions = {
+            list(cond.keys())[0]: list(cond.values())[0]
+            for cond in function_score['query']['bool']['must']
+        }
+        self.assertCountEqual(conditions.keys(), ['match', 'range'])
+        self.assertEqual(conditions['match']['transcriptions.text'], 'paris')
+        self.assertEqual(conditions['range']['transcriptions.score']['gte'], 0.5)
+
     def test_act_search(self):
         act = Act.objects.get(number="1")
         ts = Transcription.objects.filter(text__in=["PARIS", "ROY"], zone__image__path='img1')
@@ -154,11 +179,17 @@
         self.assertEqual(kwargs['body']['from'], 0)
         self.assertEqual(kwargs['body']['size'], 1)
 
-        self.assertEqual(kwargs['body']['query']['nested']['score_mode'], 'sum')
-        self.assertEqual(kwargs['body']['query']['nested']['path'], 'transcriptions')
-        self.assertIn('inner_hits', kwargs['body']['query']['nested'])
+        self.assertCountEqual(
+            kwargs['body']['query']['bool']['filter'][0]['terms']['corpus'],
+            map(str, Corpus.objects.readable(AnonymousUser()).values_list('id', flat=True)),
+        )
+
+        nested = kwargs['body']['query']['bool']['must'][0]['nested']
+        self.assertEqual(nested['score_mode'], 'sum')
+        self.assertEqual(nested['path'], 'transcriptions')
+        self.assertIn('inner_hits', nested)
 
-        function_score = kwargs['body']['query']['nested']['query']['function_score']
+        function_score = nested['query']['function_score']
         self.assertListEqual(function_score['functions'], [
             {
                 "field_value_factor": {
@@ -173,13 +204,9 @@
             list(cond.keys())[0]: list(cond.values())[0]
             for cond in function_score['query']['bool']['must']
         }
-        self.assertCountEqual(conditions.keys(), ['match', 'terms', 'range'])
+        self.assertCountEqual(conditions.keys(), ['match', 'range'])
         self.assertEqual(conditions['match']['transcriptions.text'], 'paris roy')
         self.assertEqual(conditions['range']['transcriptions.score']['gte'], 0.5)
-        self.assertCountEqual(
-            conditions['terms']['transcriptions.corpus'],
-            map(str, Corpus.objects.readable(AnonymousUser()).values_list('id', flat=True)),
-        )
 
     def test_iiif_transcription_search(self):
         # Filter to only get transcriptions from volume 1
diff --git a/arkindex/documents/urls.py b/arkindex/documents/urls.py
index 15f8b553b20542405a8e37aa280c1892780f3d25..7b22be18b9dd48796a69382b2f6696aa395947fd 100644
--- a/arkindex/documents/urls.py
+++ b/arkindex/documents/urls.py
@@ -1,12 +1,11 @@
 from django.conf import settings
 from django.conf.urls import url, include
-from arkindex.documents.views import \
-    VolumesList, VolumePages, VolumeActs, TranscriptionsSearch, ActsSearch
+from arkindex.documents.views import VolumesList, VolumePages, VolumeActs, PagesSearch, ActsSearch, PageDetails
 
 
 urlpatterns = [
-    url(r'^transcriptions/$', TranscriptionsSearch.as_view(), name='transcriptions'),
+    url(r'^pages/$', PagesSearch.as_view(), name='pages'),
     url(r'^acts/$', ActsSearch.as_view(), name='acts'),
 
     # Volumes
@@ -15,6 +14,10 @@
     url(r'^volume/(?P<pk>[\w\-]+)/acts/$', VolumeActs.as_view(), name='volume-acts'),
     url(r'^$', VolumesList.as_view(), name='volumes'),
+
+    # Page info
+    url(r'^page/(?P<pk>[\w\-]+)/$',
+        PageDetails.as_view(), name='page-details'),
 ]
 
 if 'debug_toolbar' in settings.INSTALLED_APPS:
diff --git a/arkindex/documents/views.py b/arkindex/documents/views.py
index 2f24e12300dc63a980a3913636a9822a751899a4..cd6d29ab93d91c5c515bdbd21b81325fec38f5a0 100644
--- a/arkindex/documents/views.py
+++ b/arkindex/documents/views.py
@@ -1,7 +1,7 @@
 from django.views.generic import TemplateView, DetailView, View
 from django.views.generic.detail import SingleObjectMixin
 from django.http import HttpResponse
-from arkindex.documents.models import Element, ElementType, Act, Page
+from arkindex.documents.models import Element, ElementType, Act, Page, Corpus
 import io
 import csv
 
@@ -18,21 +18,25 @@
     Show volume pages using Vue JS + API
     """
     template_name = 'documents/volume.page.html'
-    queryset = Element.objects.filter(type=ElementType.Volume)
     context_object_name = 'volume'
 
+    def get_queryset(self):
+        return Element.objects.filter(type=ElementType.Volume, corpus__in=Corpus.objects.readable(self.request.user))
+
 
 class VolumeActs(DetailView):
     template_name = 'documents/volume.act.html'
-    queryset = Element.objects.filter(type=ElementType.Volume)
     context_object_name = 'volume'
 
+    def get_queryset(self):
+        return Element.objects.filter(type=ElementType.Volume, corpus__in=Corpus.objects.readable(self.request.user))
+
 
-class TranscriptionsSearch(TemplateView):
+class PagesSearch(TemplateView):
     """
     Search pages using Vue JS + API
     """
-    template_name = 'documents/transcriptions.html'
+    template_name = 'documents/pages.html'
 
 
 class ActsSearch(TemplateView):
@@ -42,11 +46,23 @@
     template_name = 'documents/acts.html'
 
 
+class PageDetails(DetailView):
+    """
+    View a page's details
+    """
+    template_name = 'documents/page.html'
+    context_object_name = 'page'
+
+    def get_queryset(self):
+        return Page.objects.filter(corpus__in=Corpus.objects.readable(self.request.user))
+
+
 class DumpActs(SingleObjectMixin, View):
     """
     Dump acts and linked surfaces data for a given volume in CSV format
     """
-    queryset = Element.objects.filter(type=ElementType.Volume)
+    def get_queryset(self):
+        return Element.objects.filter(type=ElementType.Volume, corpus__in=Corpus.objects.readable(self.request.user))
 
     def get(self, request, *args, **kwargs):
         volume = self.get_object()
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 0f3194e9544555b2f7e167a26164ad5e89307c52..e49f47af73e2bb28a73e8371cef8642b710989d4 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -4,7 +4,7 @@ from django.views.generic.base import RedirectView
 from arkindex.documents.api.elements import \
     ElementsList, RelatedElementsList, ElementRetrieve, ElementPages, ElementSurfaces, CorpusList, CorpusPages, \
     ActEdit, PageDetails, SurfaceDetails
-from arkindex.documents.api.search import TranscriptionSearch, ActSearch
+from arkindex.documents.api.search import PageSearch, ActSearch
 from arkindex.documents.api.transcriptions import TranscriptionCreate, TranscriptionBulk
 from arkindex.documents.api.iiif import \
     VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \
@@ -69,7 +69,7 @@
         name='surface-manifest'),
 
     # Search transcriptions
-    url(r'^transcriptions/$', TranscriptionSearch.as_view(), name='transcription-search'),
+    url(r'^pages/$', PageSearch.as_view(), name='page-search'),
     url(r'^acts/$', ActSearch.as_view(), name='act-search'),
 
     # Edit acts
diff --git a/arkindex/project/mixins.py b/arkindex/project/mixins.py
index b1fb9ff660f4f737475585cf19b8a82baeb4f91e..e9591a3c6fc85ad20371737d77c160f669356f4e 100644
--- a/arkindex/project/mixins.py
+++ b/arkindex/project/mixins.py
@@ -30,6 +30,13 @@ class CorpusACLMixin(object):
 
 
 class SearchAPIMixin(CorpusACLMixin):
+    template_path = None
+    es_source = True
+    es_query = None
+    es_index = None
+    es_type = None
+    es_sort = None
+    post_process_args = None
 
     def get(self, request, *args, **kwargs):
         q = request.query_params.get('q')
@@ -41,6 +48,7 @@ class SearchAPIMixin(CorpusACLMixin):
         context = {
             'query': elasticsearch_escape(self.request.query_params['q']),
             'type': self.request.query_params.get('type'),
+            'min_score': self.request.query_params.get('score'),
             'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT,
         }
         if 'corpus' in self.request.query_params:
@@ -77,6 +85,12 @@ class SearchAPIMixin(CorpusACLMixin):
     def get_type(self):
         return self.es_type or '_doc'
 
+    def get_post_process(self):
+        return self.post_process
+
+    def get_post_process_args(self):
+        return self.post_process_args or []
+
     def get_queryset(self):
         return ESQuerySet(
             _source=self.es_source,
@@ -84,7 +98,8 @@
             sort=self.get_sort(),
             es_index=self.get_index(),
             es_type=self.get_type(),
-            post_process=self.post_process,
+            post_process=self.get_post_process(),
+            post_process_args=self.get_post_process_args(),
         )
 
     def post_process(self, *args):
diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py
index 7fbb5911c8bfaaadc1698e7b41caaec708da91c9..cd51243b6e2e6c8877b6919086ed74b0184ac97d 100644
--- a/arkindex/project/settings.py
+++ b/arkindex/project/settings.py
@@ -220,6 +220,7 @@ ES_RESULTS_LIMIT = 10000
 ES_INNER_RESULTS_LIMIT = 6
 ES_INDEX_TRANSCRIPTIONS = 'transcriptions'
 ES_INDEX_ACTS = 'acts'
+ES_INDEX_PAGES = 'pages'
 
 # Silent logger for elasticsearch
 logging.getLogger('elasticsearch').setLevel(logging.WARNING)
diff --git a/arkindex/templates/base.html b/arkindex/templates/base.html
index fe38b71aa716af47f41f9147469858a9efbd50a2..12b70807fa7890719cb748eda308d9ed482b0d02 100644
--- a/arkindex/templates/base.html
+++ b/arkindex/templates/base.html
@@ -27,7 +27,7 @@
             Search
           </a>
           <div class="navbar-dropdown">
-            <a class="navbar-item" href="{% url 'transcriptions' %}">
+            <a class="navbar-item" href="{% url 'pages' %}">
               Pages
             </a>
             <a class="navbar-item" href="{% url 'acts' %}">
diff --git a/arkindex/templates/documents/page.html b/arkindex/templates/documents/page.html
new file mode 100644
index 0000000000000000000000000000000000000000..1432281247bd708d4301478c2b223a96ca6ed4c0
--- /dev/null
+++ b/arkindex/templates/documents/page.html
@@ -0,0 +1,7 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<div id="app">
+    <Page-Details id="{{ page.id }}" />
+</div>
+{% endblock %}
diff --git a/arkindex/templates/documents/transcriptions.html b/arkindex/templates/documents/pages.html
similarity index 75%
rename from arkindex/templates/documents/transcriptions.html
rename to arkindex/templates/documents/pages.html
index c8aacaa698ed9baeb54e29a2b7156792f092e5ad..57b75c17143d436de8fe19fe222460610722a042 100644
--- a/arkindex/templates/documents/transcriptions.html
+++ b/arkindex/templates/documents/pages.html
@@ -2,6 +2,6 @@
 
 {% block content %}
 <div id="app">
-    <Transcription-Search />
+    <Page-Search />
 </div>
 {% endblock %}
diff --git a/arkindex/templates/elastic/search_acts.json b/arkindex/templates/elastic/search_acts.json
deleted file mode 100644
index b56f12f437290b39163928ef5c4ce157d50032db..0000000000000000000000000000000000000000
--- a/arkindex/templates/elastic/search_acts.json
+++ /dev/null
@@ -1,64 +0,0 @@
-{
-    "nested": {
-        "path": "transcriptions",
-        "inner_hits": {
-            {% if inner_hits_size %}
-            "size": {{ inner_hits_size }}
-            {% endif %}
-        },
-        "score_mode": "sum",
-        "query": {
-            "function_score": {
-                "query": {
-                    "bool": {
-                        "must": [
-                            {
-                                "range": {
-                                    "transcriptions.score": {
-                                        "gte": {{ min_score|default:"0.5" }}
-                                    }
-                                }
-                            },
-                            {% if corpus_id %}
-                            {
-                                "match": {
-                                    "transcriptions.corpus": "{{ corpus_id }}"
-                                }
-                            },
-                            {% elif corpora_ids %}
-                            {
-                                "terms": {
-                                    "transcriptions.corpus": [
-                                        {% for corpus_id in corpora_ids %}
-                                        "{{ corpus_id }}"{% if forloop.revcounter0 != 0 %},{% endif %}
-                                        {% endfor %}
-                                    ]
-                                }
-                            },
-                            {% endif %}
-                            {% if type %}
-                            {
-                                "match": {
-                                    "transcriptions.type": "{{ type }}"
-                                }
-                            },
-                            {% endif %}
-                            {
-                                "match": {
-                                    "transcriptions.text": "{{ query|safe }}"
-                                }
-                            }
-                        ]
-                    }
-                },
-                "functions": [
-                    {
-                        "field_value_factor": {
-                            "field": "transcriptions.score"
-                        }
-                    }
-                ]
-            }
-        }
-    }
-}
diff --git a/arkindex/templates/elastic/search_nested.json b/arkindex/templates/elastic/search_nested.json
new file mode 100644
index 0000000000000000000000000000000000000000..12964f3238bcde612eb6428ccdb10935c933b770
--- /dev/null
+++ b/arkindex/templates/elastic/search_nested.json
@@ -0,0 +1,72 @@
+{
+    "bool": {
+        "filter": [
+            {% if corpus_id %}
+            {
+                "match": {
+                    "corpus": "{{ corpus_id }}"
+                }
+            }
+            {% elif corpora_ids %}
+            {
+                "terms": {
+                    "corpus": [
+                        {% for corpus_id in corpora_ids %}
+                        "{{ corpus_id }}"{% if forloop.revcounter0 != 0 %},{% endif %}
+                        {% endfor %}
+                    ]
+                }
+            }
+            {% endif %}
+        ],
+        "must": [
+            {
+                "nested": {
+                    "path": "transcriptions",
+                    "inner_hits": {
+                        {% if inner_hits_size %}
+                        "size": {{ inner_hits_size }}
+                        {% endif %}
+                    },
+                    "score_mode": "sum",
+                    "query": {
+                        "function_score": {
+                            "query": {
+                                "bool": {
+                                    "must": [
+                                        {
+                                            "range": {
+                                                "transcriptions.score": {
+                                                    "gte": {{ min_score|default:"0.5" }}
+                                                }
+                                            }
+                                        },
+                                        {% if type %}
+                                        {
+                                            "match": {
+                                                "transcriptions.type": "{{ type }}"
+                                            }
+                                        },
+                                        {% endif %}
+                                        {
+                                            "match": {
+                                                "transcriptions.text": "{{ query|safe }}"
+                                            }
+                                        }
+                                    ]
+                                }
+                            },
+                            "functions": [
+                                {
+                                    "field_value_factor": {
+                                        "field": "transcriptions.score"
+                                    }
+                                }
+                            ]
+                        }
+                    }
+                }
+            }
+        ]
+    }
+}
diff --git a/arkindex/templates/home.html b/arkindex/templates/home.html
index 5e3907ba69004f226843d03509c1f9579af27291..9ca27fabe3dd0e2308c8a13cfccf4707a5968b2b 100644
--- a/arkindex/templates/home.html
+++ b/arkindex/templates/home.html
@@ -7,7 +7,7 @@
       View available <a href="{% url 'volumes' %}">volumes</a>
     </li>
     <li>
-      Search through <a href="{% url 'transcriptions' %}">transcriptions</a>
+      Search through <a href="{% url 'pages' %}">pages</a>
     </li>
     <li>
       Search through <a href="{% url 'acts' %}">acts</a>
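
# ---------------------------------------------------------------------------
# Editor's note — not part of the patch above.
#
# A minimal sketch of how to exercise this change locally, assuming a stock
# Django manage.py setup and that the api_v1 routes are mounted under
# /api/v1/ (the mount point is not shown in this diff):
#
#   # Recreate the "pages" index with the new nested mapping, then fill it
#   # (--drop calls Indexer.drop_index() and setup() before reindex_pages runs)
#   python manage.py reindex --pages --drop
#
#   # Query the new endpoint; `q` is read by SearchAPIMixin.get(), and the
#   # optional `score` parameter feeds the min_score template variable that
#   # elastic/search_nested.json defaults to 0.5
#   curl 'http://localhost:8000/api/v1/pages/?q=paris&score=0.6'
#
# Design note: the corpus filter now lives on the root document (the new
# "corpus" keyword field) rather than inside the nested transcriptions, which
# is why both mappings in indexer.py move "corpus" up a level and the tests
# assert on query.bool.filter instead of a nested terms condition.
# ---------------------------------------------------------------------------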