Skip to content
Snippets Groups Projects
Commit 26161ca2 authored by Bastien Abadie
Browse files

Search API with images+indexes, sorted by best index first

parent d94722e9
No related branches found
No related tags found
No related merge requests found
from rest_framework.generics import ListAPIView
from documents.serializers import ImageSearchSerializer
from documents.models import Image
class ImagesSearch(ListAPIView):
    """
    List the stored images matching a search query
    """
    serializer_class = ImageSearchSerializer

    def get_queryset(self):
        """
        Return the images found for the `query` url kwarg.

        The search result is kept on the view instance, so calling
        this method again does not hit ES a second time
        """
        # Only run the search the first time this view instance is asked
        if not hasattr(self, '_cache'):
            self._cache = Image.search(self.kwargs['query'])

        return self._cache
......@@ -4,10 +4,14 @@ from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk, scan as es_scan
import requests
import hashlib
import itertools
import json
import base64
import uuid
# Name of the ElasticSearch index used for both searching and bulk inserts
ES_INDEX = 'volume_xxx'

# Document type of the entries stored in (and scanned from) that index
ES_DOC_INDEX_TYPE = 'word'
class Image(models.Model):
"""
A document image
......@@ -21,6 +25,56 @@ class Image(models.Model):
def __str__(self):
    """
    Human readable label: the image id followed by its IIIF url
    """
    label_parts = (self.id, self.iiif_url)
    return '{} - {}'.format(*label_parts)
@staticmethod
def search(query):
    """
    Search words in ElasticSearch and return the images they belong to.

    Every hit of a `match` query on the `word` field is fetched through
    the scan helper. Hits are grouped by the `image` value of their
    `_source`, the corresponding Image rows are loaded, and each loaded
    image gets a `word_results` attribute listing the `_source` payloads
    of its hits.

    :param query: text matched against the `word` field
    :return: list of Image instances, ordered by the highest `score`
        found in their `word_results`, best first
    """
    # Built under its own name so the `query` argument is not overwritten
    es_query = {
        'bool': {
            'must': [
                {
                    'match': {
                        'word': query,
                    }
                },
            ],
        }
    }

    elastic = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
    hits = es_scan(
        client=elastic,
        index=ES_INDEX,
        query={'query': es_query},
        doc_type=ES_DOC_INDEX_TYPE,
    )

    # Group by images, as an in-memory dict whose keys are image ids.
    # A single pass is enough: unlike itertools.groupby, this does not
    # need the hits to be sorted by image beforehand.
    words_by_image = {}
    for hit in hits:
        source = hit['_source']
        words_by_image.setdefault(source['image'], []).append(source)

    # Load all images from results
    images = Image.objects.filter(pk__in=words_by_image.keys())

    # Map images with their indexes; the grouping keys are looked up with
    # the hex form of the image id
    for image in images:
        image.word_results = words_by_image[image.id.hex]

    # Sort images by best index score; `score` is read from the stored
    # payload of each hit
    def _best_index(image):
        return max(index['score'] for index in image.word_results)

    return sorted(images, key=_best_index, reverse=True)
def check_source(self):
"""
Check the image is available through this url
......@@ -63,8 +117,8 @@ class Image(models.Model):
# Build raw ElasticSearch insert
actions = [{
'_index': 'volume_xxx', # TODO: use volume ?
'_type': 'word',
'_index': ES_INDEX,
'_type': ES_DOC_INDEX_TYPE,
'_source': index_data,
'_id': index_id,
} for index_id, index_data in indexes]
......
from rest_framework import serializers
from documents.models import Image
class WordSerializer(serializers.Serializer):
    """
    Serialises one word index: the word itself, its score
    and its box
    """
    # Field declaration order is preserved on purpose
    word = serializers.CharField()
    # The score is exposed through a character field
    score = serializers.CharField()
    box = serializers.ListField()
class ImageSearchSerializer(serializers.ModelSerializer):
    """
    Serialises an image returned by a search,
    along with the word indexes attached to it
    """
    # One nested WordSerializer entry per item of `word_results`
    word_results = WordSerializer(many=True)

    class Meta:
        model = Image
        fields = ('id', 'iiif_url', 'word_results')
from django.conf.urls import url, include
from documents.api import ImagesSearch
urlpatterns = [
    # Images search using indexes: the text after `images/search/`
    # (word characters and whitespace only) is passed to the view
    # as the `query` keyword argument
    url(
        r'^images/search/(?P<query>[\w\s]+)$',
        ImagesSearch.as_view(),
        name='images-search',
    ),
]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment