From 26161ca2291510894f27d82efb0790408431ef6d Mon Sep 17 00:00:00 2001 From: Bastien Abadie <bastien@nextcairn.com> Date: Wed, 25 Oct 2017 17:00:56 +0200 Subject: [PATCH] Search API with images+indexes, sorted by best index first --- src/documents/api.py | 19 ++++++++++++ src/documents/models.py | 58 ++++++++++++++++++++++++++++++++++-- src/documents/serializers.py | 23 ++++++++++++++ src/horae/api.py | 4 +++ 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 src/documents/api.py create mode 100644 src/documents/serializers.py diff --git a/src/documents/api.py b/src/documents/api.py new file mode 100644 index 0000000000..439147109a --- /dev/null +++ b/src/documents/api.py @@ -0,0 +1,19 @@ +from rest_framework.generics import ListAPIView +from documents.serializers import ImageSearchSerializer +from documents.models import Image + +class ImagesSearch(ListAPIView): + """ + Search through stored images + """ + serializer_class = ImageSearchSerializer + + def get_queryset(self): + """ + Support local memory cache to avoid multiple hits on ES + """ + if hasattr(self, '_cache'): + return self._cache + + self._cache = Image.search(self.kwargs['query']) + return self._cache diff --git a/src/documents/models.py b/src/documents/models.py index 9468849530..8d36dc5963 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -4,10 +4,14 @@ from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk as es_bulk, scan as es_scan import requests import hashlib +import itertools import json import base64 import uuid +ES_DOC_INDEX_TYPE = 'word' +ES_INDEX = 'volume_xxx' + class Image(models.Model): """ A document image @@ -21,6 +25,56 @@ class Image(models.Model): def __str__(self): return '{} - {}'.format(self.id, self.iiif_url) + @staticmethod + def search(query): + """ + Search into ElasticSearch + """ + query = { + 'bool' : { + 'must': [ + { + 'match' : { + 'word': query, + } + }, + ], + } + } + elastic = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS) + results = es_scan( + client=elastic, + index=ES_INDEX, + query={'query': query}, + doc_type=ES_DOC_INDEX_TYPE, + ) + + # Group by images, as an in-memory dict + # dict keys are image ids + grouper = lambda r : r['_source']['image'] + results = sorted(results, key=grouper) + indexes = { + image_id : [i['_source'] for i in indexes] # in memory copy + for image_id, indexes in itertools.groupby(results, grouper) + } + + # Load all images from results + images = Image.objects.filter(pk__in=indexes.keys()) + + # Map images with their indexes + for image in images: + image.word_results = indexes[image.id.hex] + + # Sort images by best index score + def _best_index(image): + return max([ + index['score'] + for index in image.word_results + ]) + images = sorted(images, key=_best_index, reverse=True) + + return images + def check_source(self): """ Check the image is available through this url @@ -63,8 +117,8 @@ class Image(models.Model): # Build raw ElasticSearch insert actions = [{ - '_index': 'volume_xxx', # TODO: use volume ? - '_type': 'word', + '_index': ES_INDEX, + '_type': ES_DOC_INDEX_TYPE, '_source': index_data, '_id': index_id, } for index_id, index_data in indexes] diff --git a/src/documents/serializers.py b/src/documents/serializers.py new file mode 100644 index 0000000000..60098e0939 --- /dev/null +++ b/src/documents/serializers.py @@ -0,0 +1,23 @@ +from rest_framework import serializers +from documents.models import Image + + +class WordSerializer(serializers.Serializer): + word = serializers.CharField() + score = serializers.CharField() + box = serializers.ListField() + +class ImageSearchSerializer(serializers.ModelSerializer): + """ + Serialises an image with its index + issued from a search + """ + word_results = WordSerializer(many=True) + + class Meta: + model = Image + fields = ( + 'id', + 'iiif_url', + 'word_results', + ) diff --git a/src/horae/api.py b/src/horae/api.py index ba1c226c18..546f91921f 100644 --- a/src/horae/api.py +++ b/src/horae/api.py @@ -1,5 +1,9 @@ from django.conf.urls import url, include +from documents.api import ImagesSearch urlpatterns = [ + + # Images search using indexes + url(r'^images/search/(?P<query>[\w\s]+)$', ImagesSearch.as_view(), name='images-search'), ] -- GitLab