From 5e6f4fe43f610e53dd754863aa60b6cc0124322d Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Mon, 8 Apr 2019 14:53:28 +0000
Subject: [PATCH] Dates in search api

---
 arkindex/documents/api/search.py              |  1 +
 arkindex/documents/serializers/search.py      | 18 +++++-
 arkindex/documents/tests/test_search_api.py   | 52 +++++++++++++++
 arkindex/project/mixins.py                    | 25 +++-----
 arkindex/project/serializer_fields.py         | 64 ++++++++++++++++++-
 arkindex/templates/elastic/search_nested.json | 15 +++++
 6 files changed, 157 insertions(+), 18 deletions(-)
 create mode 100644 arkindex/documents/tests/test_search_api.py

diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py
index 5f38a4fbca..a454981984 100644
--- a/arkindex/documents/api/search.py
+++ b/arkindex/documents/api/search.py
@@ -30,6 +30,7 @@ class PageSearch(SearchAPIView):
 class ActSearch(SearchAPIView):
     """
     Search for acts containing a specific word
+    within a specific period
     """
     serializer_class = ActSearchResultSerializer
     template_path = 'elastic/search_nested.json'
diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py
index f8767a897e..0d77b1a021 100644
--- a/arkindex/documents/serializers/search.py
+++ b/arkindex/documents/serializers/search.py
@@ -1,8 +1,24 @@
 from rest_framework import serializers
-from arkindex.documents.models import Act, Page
+from arkindex.documents.models import Act, Page, TranscriptionType
 from arkindex.documents.serializers.light import CorpusLightSerializer
 from arkindex.documents.serializers.elements import ElementLightSerializer
 from arkindex.documents.serializers.ml import TranscriptionSerializer
+from arkindex.project.serializer_fields import EnumField, SimpleDateField, SearchTermsField
+
+
+class SearchQuerySerializer(serializers.Serializer):
+    """
+    Validate search parameters in order to build an ES query.
+
+    date_lte is rounded up using ES date math (see round_date()) and exposed as date_lt (strictly lower than).
+    The rounded value depends on whether month/day are provided (e.g. '<= march 1999' becomes '< april 1999').
+    """
+    q = SearchTermsField(source='query')  # mandatory search terms
+    score = serializers.FloatField(source='min_score', min_value=0.0, max_value=1.0, default=0.0)
+    date_gte = SimpleDateField(default='')
+    date_lte = SimpleDateField(source='date_lt', rounded=True, default='')  # inclusive bound turned into an exclusive one
+    type = EnumField(enum=TranscriptionType, default='')
+    corpus = serializers.UUIDField(source='corpus_id', default='')  # empty string when no corpus is given
 
 
 class PageSearchResultSerializer(serializers.ModelSerializer):
diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py
new file mode 100644
index 0000000000..dd03562515
--- /dev/null
+++ b/arkindex/documents/tests/test_search_api.py
@@ -0,0 +1,52 @@
+from arkindex.project.tests import FixtureAPITestCase
+from arkindex.documents.models import Corpus
+from django.urls import reverse
+from rest_framework import status
+
+
+class TestSearchApi(FixtureAPITestCase):
+
+    @classmethod
+    def setUpTestData(cls):
+        super().setUpTestData()
+        cls.private_corpus = Corpus.objects.create(name='private', public=False)
+
+    def setUp(self):
+        super().setUp()
+        self.valid_params = (  # each set is expected to return HTTP 200
+            {'q': 'a', 'score': '.7'},
+            {'q': 'one two', 'date_lte': '1333'},
+            {'q': 'one two', 'date_lte': '1333-12'},
+            {'q': 'one two', 'date_lte': '1333-12-02'},
+            {'q': 'one', 'type': 'page'},
+            {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'},
+        )
+        self.wrong_params = (  # each set is expected to return HTTP 400
+            {'q': ' ', 'score': '0.7'},  # blank query
+            {'q': 'one', 'score': '1.01'},  # score above 1.0
+            {'q': 'that', 'score': 'null'},  # score is not a number
+            {'q': 'one two', 'date_lte': '1450-'},  # dangling separator
+            {'q': 'one two', 'date_lte': '1450-02-30'},  # February never has 30 days
+            {'q': 'one', 'type': 'wrongtype'},
+            {'q': 'cat', 'corpus': 'not_even_an_uuid'}
+        )
+        self.forbidden_params = (  # each set is expected to return HTTP 403
+            {'q': 'knowledge', 'corpus': self.private_corpus.id},
+        )
+
+    def test_search_api(self):
+        """
+        Check that different sets of client-provided parameters are
+        correctly handled by the search API endpoint
+        """
+        for params in self.valid_params:
+            response = self.client.get(reverse('api:act-search'), params)
+            self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+        for params in self.wrong_params:
+            response = self.client.get(reverse('api:act-search'), params)
+            self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+
+        for params in self.forbidden_params:
+            response = self.client.get(reverse('api:act-search'), params)
+            self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
diff --git a/arkindex/project/mixins.py b/arkindex/project/mixins.py
index b98de51861..f408d72709 100644
--- a/arkindex/project/mixins.py
+++ b/arkindex/project/mixins.py
@@ -1,9 +1,9 @@
 from django.conf import settings
 from django.shortcuts import get_object_or_404
-from rest_framework.exceptions import PermissionDenied, ValidationError, APIException
+from rest_framework.exceptions import PermissionDenied, APIException
 from arkindex.documents.models import Corpus, Right
+from arkindex.documents.serializers.search import SearchQuerySerializer
 from arkindex.project.elastic import ESQuerySet
-from arkindex.project.tools import elasticsearch_escape
 
 
 class CorpusACLMixin(object):
@@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin):
     es_sort = None
     post_process_args = None
 
-    def get(self, request, *args, **kwargs):
-        q = request.query_params.get('q')
-        if not q or q.isspace():
-            raise ValidationError('A search query is required')
-        return super().get(request, *args, **kwargs)
-
     def get_context(self):
-        context = {
-            'query': elasticsearch_escape(self.request.query_params['q']),
-            'type': self.request.query_params.get('type'),
-            'min_score': self.request.query_params.get('score'),
-            'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT,
-        }
-        if 'corpus' in self.request.query_params:
+        serializer = SearchQuerySerializer(context={'request': self.request}, data=self.request.query_params)
+        serializer.is_valid(raise_exception=True)
+        context = serializer.validated_data
+        context['inner_hits_size'] = settings.ES_INNER_RESULTS_LIMIT
+        # TODO Handle corpus field in serializer too
+        if context['corpus_id']:
             try:
-                context['corpus_id'] = str(self.get_corpus(self.request.query_params['corpus']).id)
+                context['corpus_id'] = str(self.get_corpus(context['corpus_id']).id)
             except Corpus.DoesNotExist:
                 raise PermissionDenied
         else:
diff --git a/arkindex/project/serializer_fields.py b/arkindex/project/serializer_fields.py
index 1b54ccdf9a..2bab2ddfa1 100644
--- a/arkindex/project/serializer_fields.py
+++ b/arkindex/project/serializer_fields.py
@@ -2,6 +2,11 @@ from django.conf import settings
 from rest_framework import serializers
 from enum import Enum
 from arkindex_common.ml_tool import MLTool, MLToolType
+from arkindex.project.tools import elasticsearch_escape
+import calendar
+import re
+
+DATE_REGEX = re.compile(r'^(?P<year>[1-2]\d{3})(-(?P<month>0[1-9]|1[0-2])(-(?P<day>\d{2}))?)?$')
 
 
 class EnumField(serializers.ChoiceField):
@@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField):
 
     def to_internal_value(self, data):
         assert self.enum is not None, "No enum set on EnumField"
-        return self.enum(data)
+        try:
+            return self.enum(data)
+        except ValueError:  # turn the failed enum lookup into a validation error
+            raise serializers.ValidationError('Value is not of type {}'.format(self.enum.__name__))
 
 
 class MLToolField(serializers.CharField):
@@ -38,3 +46,57 @@ class MLToolField(serializers.CharField):
             return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data)
         except ValueError as e:
             raise serializers.ValidationError(str(e))
+
+
+class SearchTermsField(serializers.CharField):
+    """
+    ElasticSearch query terms field: the raw input is passed through elasticsearch_escape()
+    """
+    def to_internal_value(self, query_terms):
+        return elasticsearch_escape(query_terms)
+
+
+def date_validator(date):
+    """
+    Validate that a date string follows the YYYY[-MM[-DD]] format expected by ElasticSearch.
+    The regex validates the year (1000 to 2999) and the optional month (01 to 12); the optional
+    2-digit day is then checked against the actual number of days of that month.
+    Raises serializers.ValidationError if the date is malformed; returns None otherwise.
+    """
+    match = DATE_REGEX.fullmatch(date)  # match() would let '$' accept a trailing newline
+    if not match:
+        raise serializers.ValidationError('Could not parse date. format should be YYYY[-MM[-DD]]')
+    date_dict = match.groupdict()
+    # Validate day depending on year/month if a day is present
+    if date_dict['day']:
+        year, month, day = (int(date_dict[key]) for key in ('year', 'month', 'day'))
+        if not 1 <= day <= calendar.monthrange(year, month)[1]:
+            raise serializers.ValidationError('Could not parse date. Day is invalid')
+
+
+def round_date(date):
+    """
+    Append the ElasticSearch date math suffix rounding a date up, depending on whether month/day are provided
+    https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math
+    """
+    precision = '||+1d'  # default: add one day
+    if re.match(r'^\d{4}-\d{2}$', date):
+        precision = '||+1M'  # year and month only: add one month
+    elif re.match(r'^\d{4}$', date):
+        precision = '||+1y'  # year only: add one year
+    return date + precision
+
+
+class SimpleDateField(serializers.CharField):
+    """
+    Date field checked by date_validator(); if rounded is set, round_date() appends its ES suffix
+    """
+    def __init__(self, rounded=False, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.rounded = rounded
+
+    def to_internal_value(self, date):
+        date_validator(date)  # raises ValidationError on an invalid date
+        if self.rounded:
+            date = round_date(date)
+        return date
diff --git a/arkindex/templates/elastic/search_nested.json b/arkindex/templates/elastic/search_nested.json
index 12964f3238..556a61c1e5 100644
--- a/arkindex/templates/elastic/search_nested.json
+++ b/arkindex/templates/elastic/search_nested.json
@@ -1,6 +1,21 @@
 {
     "bool": {
         "filter": [
+            {% if date_gte or date_lt %}
+            {
+                "range": {
+                    "date_range": {
+                        {% if date_lt %}
+                        "lt": "{{ date_lt }}",
+                        {% endif %}
+                        {% if date_gte %}
+                        "gte": "{{ date_gte }}",
+                        {% endif %}
+                        "relation": "intersects"
+                    }
+                }
+            },
+            {% endif %}
             {% if corpus_id %}
             {
                 "match": {
-- 
GitLab