From 26161ca2291510894f27d82efb0790408431ef6d Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Wed, 25 Oct 2017 17:00:56 +0200
Subject: [PATCH] Search API with images+indexes, sorted by best index first

---
 src/documents/api.py         | 19 ++++++++++++
 src/documents/models.py      | 58 ++++++++++++++++++++++++++++++++++--
 src/documents/serializers.py | 23 ++++++++++++++
 src/horae/api.py             |  4 +++
 4 files changed, 102 insertions(+), 2 deletions(-)
 create mode 100644 src/documents/api.py
 create mode 100644 src/documents/serializers.py

diff --git a/src/documents/api.py b/src/documents/api.py
new file mode 100644
index 0000000000..439147109a
--- /dev/null
+++ b/src/documents/api.py
@@ -0,0 +1,19 @@
+from rest_framework.generics import ListAPIView
+from documents.serializers import ImageSearchSerializer
+from documents.models import Image
+
+class ImagesSearch(ListAPIView):
+    """
+    List the stored images matching the `query` view argument
+    """
+    serializer_class = ImageSearchSerializer
+
+    def get_queryset(self):
+        """
+        Search once per view instance and keep the result in memory,
+        so that further calls do not hit ES again
+        """
+        # First call on this instance: nothing stored yet
+        if not hasattr(self, '_cache'):
+            self._cache = Image.search(self.kwargs['query'])
+        return self._cache
diff --git a/src/documents/models.py b/src/documents/models.py
index 9468849530..8d36dc5963 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -4,10 +4,14 @@ from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk as es_bulk, scan as es_scan
 import requests
 import hashlib
+import itertools
 import json
 import base64
 import uuid
 
+ES_DOC_INDEX_TYPE = 'word'  # ES document type used to store and search the words
+ES_INDEX = 'volume_xxx'  # ES index holding the words - TODO: use volume ?
+
 class Image(models.Model):
     """
     A document image
@@ -21,6 +25,56 @@ class Image(models.Model):
     def __str__(self):
         return '{} - {}'.format(self.id, self.iiif_url)
 
+    @staticmethod
+    def search(query):
+        """
+        Search words into ElasticSearch and return the matching images
+
+        Every hit whose `word` field matches `query` is read from ES_INDEX,
+        then stored on its image as `image.word_results`.
+        Returns a list of Image, the one with the best word score first.
+        """
+        es_query = {
+            'query': {
+                'bool': {
+                    'must': [
+                        {
+                            'match': {'word': query},
+                        },
+                    ],
+                },
+            },
+        }
+        hits = es_scan(
+            client=Elasticsearch(settings.ELASTIC_SEARCH_HOSTS),
+            index=ES_INDEX,
+            query=es_query,
+            doc_type=ES_DOC_INDEX_TYPE,
+        )
+
+        def _image_id(hit):
+            return hit['_source']['image']
+
+        # groupby only merges adjacent items: order the hits by image first
+        # Each group is copied to a list, as groupby iterators are single-use
+        words_by_image = {}
+        ordered_hits = sorted(hits, key=_image_id)
+        for image_id, group in itertools.groupby(ordered_hits, _image_id):
+            words_by_image[image_id] = [hit['_source'] for hit in group]
+
+        # Load all images from results, and give them their words
+        images = list(Image.objects.filter(pk__in=words_by_image.keys()))
+        for image in images:
+            image.word_results = words_by_image[image.id.hex]
+
+        # Best scored images come first
+        images.sort(
+            key=lambda image: max(word['score'] for word in image.word_results),
+            reverse=True,
+        )
+
+        return images
+
     def check_source(self):
         """
         Check the image is available through this url
@@ -63,8 +117,8 @@ class Image(models.Model):
 
         # Build raw ElasticSearch insert
         actions = [{
-            '_index': 'volume_xxx', # TODO: use volume ?
-            '_type': 'word',
+            '_index': ES_INDEX,
+            '_type': ES_DOC_INDEX_TYPE,
             '_source': index_data,
             '_id': index_id,
         } for index_id, index_data in indexes]
diff --git a/src/documents/serializers.py b/src/documents/serializers.py
new file mode 100644
index 0000000000..60098e0939
--- /dev/null
+++ b/src/documents/serializers.py
@@ -0,0 +1,23 @@
+from rest_framework import serializers
+from documents.models import Image
+
+
+class WordSerializer(serializers.Serializer):  # one entry of an image's `word_results`
+    word = serializers.CharField()
+    score = serializers.CharField()  # NOTE(review): Image.search ranks on this value; is it numeric in ES? confirm
+    box = serializers.ListField()
+
+class ImageSearchSerializer(serializers.ModelSerializer):
+    """
+    Serialise an image found by a search,
+    along with the indexed words that matched
+
+    `word_results` is expected on every instance:
+    Image.search stores it before returning the images
+    """
+    # One nested entry per matched word
+    word_results = WordSerializer(many=True)
+
+    class Meta:
+        model = Image
+        fields = ('id', 'iiif_url', 'word_results')
diff --git a/src/horae/api.py b/src/horae/api.py
index ba1c226c18..546f91921f 100644
--- a/src/horae/api.py
+++ b/src/horae/api.py
@@ -1,5 +1,9 @@
 from django.conf.urls import url, include
+from documents.api import ImagesSearch
 
 
 urlpatterns = [
+
+    # Search images by indexed word; <query> only accepts word characters and whitespace
+    url(r'^images/search/(?P<query>[\w\s]+)$', ImagesSearch.as_view(), name='images-search'),
 ]
-- 
GitLab