From 5e6f4fe43f610e53dd754863aa60b6cc0124322d Mon Sep 17 00:00:00 2001 From: Valentin Rigal <rigal@teklia.com> Date: Mon, 8 Apr 2019 14:53:28 +0000 Subject: [PATCH] Dates in search api --- arkindex/documents/api/search.py | 1 + arkindex/documents/serializers/search.py | 18 +++++- arkindex/documents/tests/test_search_api.py | 52 +++++++++++++++ arkindex/project/mixins.py | 25 +++----- arkindex/project/serializer_fields.py | 64 ++++++++++++++++++- arkindex/templates/elastic/search_nested.json | 15 +++++ 6 files changed, 157 insertions(+), 18 deletions(-) create mode 100644 arkindex/documents/tests/test_search_api.py diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py index 5f38a4fbca..a454981984 100644 --- a/arkindex/documents/api/search.py +++ b/arkindex/documents/api/search.py @@ -30,6 +30,7 @@ class PageSearch(SearchAPIView): class ActSearch(SearchAPIView): """ Search for acts containing a specific word + within a specific period """ serializer_class = ActSearchResultSerializer template_path = 'elastic/search_nested.json' diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py index f8767a897e..0d77b1a021 100644 --- a/arkindex/documents/serializers/search.py +++ b/arkindex/documents/serializers/search.py @@ -1,8 +1,24 @@ from rest_framework import serializers -from arkindex.documents.models import Act, Page +from arkindex.documents.models import Act, Page, TranscriptionType from arkindex.documents.serializers.light import CorpusLightSerializer from arkindex.documents.serializers.elements import ElementLightSerializer from arkindex.documents.serializers.ml import TranscriptionSerializer +from arkindex.project.serializer_fields import EnumField, SimpleDateField, SearchTermsField + + +class SearchQuerySerializer(serializers.Serializer): + """ + Search parameters validation serializer in order to build an ES query + + date_lte is rounded up using ES date-math syntax (see round_date()) and served
as date_lt (strictly lower than) + The rounded value depends on whether month/day are provided (e.g. '<= march 1999' becomes '< april 1999') + """ + q = SearchTermsField(source='query') + score = serializers.FloatField(source='min_score', min_value=0.0, max_value=1.0, default=0.0) + date_gte = SimpleDateField(default='') + date_lte = SimpleDateField(source='date_lt', rounded=True, default='') + type = EnumField(enum=TranscriptionType, default='') + corpus = serializers.UUIDField(source='corpus_id', default='') class PageSearchResultSerializer(serializers.ModelSerializer): diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py new file mode 100644 index 0000000000..dd03562515 --- /dev/null +++ b/arkindex/documents/tests/test_search_api.py @@ -0,0 +1,52 @@ +from arkindex.project.tests import FixtureAPITestCase +from arkindex.documents.models import Corpus +from django.urls import reverse +from rest_framework import status + + +class TestSearchApi(FixtureAPITestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.private_corpus = Corpus.objects.create(name='private', public=False) + + def setUp(self): + super().setUp() + self.valid_params = ( + {'q': 'a', 'score': '.7'}, + {'q': 'one two', 'date_lte': '1333'}, + {'q': 'one two', 'date_lte': '1333-12'}, + {'q': 'one two', 'date_lte': '1333-12-02'}, + {'q': 'one', 'type': 'page'}, + {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'}, + ) + self.wrong_params = ( + {'q': ' ', 'score': '0.7'}, + {'q': 'one', 'score': '1.01'}, + {'q': 'that', 'score': 'null'}, + {'q': 'one two', 'date_lte': '1450-'}, + {'q': 'one two', 'date_lte': '1450-02-30'}, + {'q': 'one', 'type': 'wrongtype'}, + {'q': 'cat', 'corpus': 'not_even_an_uuid'} + ) + self.forbidden_params = ( + {'q': 'knowledge', 'corpus': self.private_corpus.id}, + ) + + def test_search_api(self): + """ + Check that different sets of client-provided parameters are + correctly
handled by search api endpoint + """ + for params in self.valid_params: + response = self.client.get(reverse('api:act-search'), params) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + for params in self.wrong_params: + response = self.client.get(reverse('api:act-search'), params) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + for params in self.forbidden_params: + response = self.client.get(reverse('api:act-search'), params) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) diff --git a/arkindex/project/mixins.py b/arkindex/project/mixins.py index b98de51861..f408d72709 100644 --- a/arkindex/project/mixins.py +++ b/arkindex/project/mixins.py @@ -1,9 +1,9 @@ from django.conf import settings from django.shortcuts import get_object_or_404 -from rest_framework.exceptions import PermissionDenied, ValidationError, APIException +from rest_framework.exceptions import PermissionDenied, APIException from arkindex.documents.models import Corpus, Right +from arkindex.documents.serializers.search import SearchQuerySerializer from arkindex.project.elastic import ESQuerySet -from arkindex.project.tools import elasticsearch_escape class CorpusACLMixin(object): @@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin): es_sort = None post_process_args = None - def get(self, request, *args, **kwargs): - q = request.query_params.get('q') - if not q or q.isspace(): - raise ValidationError('A search query is required') - return super().get(request, *args, **kwargs) - def get_context(self): - context = { - 'query': elasticsearch_escape(self.request.query_params['q']), - 'type': self.request.query_params.get('type'), - 'min_score': self.request.query_params.get('score'), - 'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT, - } - if 'corpus' in self.request.query_params: + serializer = SearchQuerySerializer(context={'request': self.request}, data=self.request.query_params) + serializer.is_valid(raise_exception=True) + context = 
serializer.validated_data + context['inner_hits_size'] = settings.ES_INNER_RESULTS_LIMIT + # TODO Handle corpus field in serializer too + if context['corpus_id']: try: - context['corpus_id'] = str(self.get_corpus(self.request.query_params['corpus']).id) + context['corpus_id'] = str(self.get_corpus(context['corpus_id']).id) except Corpus.DoesNotExist: raise PermissionDenied else: diff --git a/arkindex/project/serializer_fields.py b/arkindex/project/serializer_fields.py index 1b54ccdf9a..2bab2ddfa1 100644 --- a/arkindex/project/serializer_fields.py +++ b/arkindex/project/serializer_fields.py @@ -2,6 +2,11 @@ from django.conf import settings from rest_framework import serializers from enum import Enum from arkindex_common.ml_tool import MLTool, MLToolType +from arkindex.project.tools import elasticsearch_escape +import calendar +import re + +DATE_REGEX = re.compile(r'^(?P<year>[1-2]\d{3})(-(?P<month>0[1-9]|1[0-2])(-(?P<day>\d{2}))?)?$') class EnumField(serializers.ChoiceField): @@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField): def to_internal_value(self, data): assert self.enum is not None, "No enum set on EnumField" - return self.enum(data) + try: + return self.enum(data) + except ValueError: + raise serializers.ValidationError('Value is not of type {}'.format(self.enum.__name__)) class MLToolField(serializers.CharField): @@ -38,3 +46,57 @@ class MLToolField(serializers.CharField): return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data) except ValueError as e: raise serializers.ValidationError(str(e)) + + +class SearchTermsField(serializers.CharField): + """ + Serialize ElasticSearch query terms + """ + def to_internal_value(self, query_terms): + return elasticsearch_escape(query_terms) + + +def date_validator(date): + """ + Validate that the date format is understandable by ElasticSearch. + The regex matches a mandatory year, then an optional 2-digit month and day.
+ Year (between 1000 and 2999) and month (between 1 and 12) are directly validated by the + regex, whereas the day, if found, is taken as a 2-digit number and checked against the calendar afterwards. + """ + match = DATE_REGEX.match(date) + if not match: + raise serializers.ValidationError('Could not parse date. format should be YYYY[-MM[-DD]]') + date_dict = match.groupdict() + # Validate day depending on year/month if a day is present + if date_dict['day']: + date_dict = {k: int(e) for k, e in date_dict.items()} + if not date_dict['day'] in range(1, calendar.monthrange(date_dict['year'], date_dict['month'])[1] + 1): + raise serializers.ValidationError('Could not parse date. Day is invalid') + + +def round_date(date): + """ + Add an ElasticSearch date-math suffix to round the date up, depending on whether month/day are provided + https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math + """ + precision = '||+1d' + if re.match(r'^\d{4}-\d{2}$', date): + precision = '||+1M' + elif re.match(r'^\d{4}$', date): + precision = '||+1y' + return date + precision + + +class SimpleDateField(serializers.CharField): + """ + Serialize a date understandable by ElasticSearch + """ + def __init__(self, rounded=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.rounded = rounded + + def to_internal_value(self, date): + date_validator(date) + if self.rounded: + date = round_date(date) + return date diff --git a/arkindex/templates/elastic/search_nested.json b/arkindex/templates/elastic/search_nested.json index 12964f3238..556a61c1e5 100644 --- a/arkindex/templates/elastic/search_nested.json +++ b/arkindex/templates/elastic/search_nested.json @@ -1,6 +1,21 @@ { "bool": { "filter": [ + {% if date_gte or date_lt %} + { + "range": { + "date_range": { + {% if date_lt %} + "lt": "{{ date_lt }}", + {% endif %} + {% if date_gte %} + "gte": "{{ date_gte }}", + {% endif %} + "relation": "intersects" + } + } + }, + {% endif %} {% if corpus_id %} { "match": { -- GitLab