Skip to content
Snippets Groups Projects
Commit e089f259 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

Merge branch 'dates-in-search-api' into 'master'

Dates in search api

See merge request !267
parents 26286305 5e6f4fe4
No related branches found
No related tags found
1 merge request!267Dates in search api
...@@ -30,6 +30,7 @@ class PageSearch(SearchAPIView): ...@@ -30,6 +30,7 @@ class PageSearch(SearchAPIView):
class ActSearch(SearchAPIView): class ActSearch(SearchAPIView):
""" """
Search for acts containing a specific word Search for acts containing a specific word
within a specific period
""" """
serializer_class = ActSearchResultSerializer serializer_class = ActSearchResultSerializer
template_path = 'elastic/search_nested.json' template_path = 'elastic/search_nested.json'
......
from rest_framework import serializers from rest_framework import serializers
from arkindex.documents.models import Act, Page from arkindex.documents.models import Act, Page, TranscriptionType
from arkindex.documents.serializers.light import CorpusLightSerializer from arkindex.documents.serializers.light import CorpusLightSerializer
from arkindex.documents.serializers.elements import ElementLightSerializer from arkindex.documents.serializers.elements import ElementLightSerializer
from arkindex.documents.serializers.ml import TranscriptionSerializer from arkindex.documents.serializers.ml import TranscriptionSerializer
from arkindex.project.serializer_fields import EnumField, SimpleDateField, SearchTermsField
class SearchQuerySerializer(serializers.Serializer):
    """
    Validate client-provided search parameters before building an ES query

    The inclusive upper bound `date_lte` is rounded up using ES date math
    (see round_date()) and exposed to the query as the exclusive bound `date_lt`.
    The rounding unit depends on whether a month/day was provided
    (Ex: '<= march 1999' becomes '< april 1999')
    """

    # Declaration order is preserved on purpose: it drives the order of validated_data
    q = SearchTermsField(
        source='query',
    )
    score = serializers.FloatField(
        source='min_score',
        min_value=0.0,
        max_value=1.0,
        default=0.0,
    )
    date_gte = SimpleDateField(
        default='',
    )
    date_lte = SimpleDateField(
        source='date_lt',
        rounded=True,
        default='',
    )
    type = EnumField(
        enum=TranscriptionType,
        default='',
    )
    corpus = serializers.UUIDField(
        source='corpus_id',
        default='',
    )
class PageSearchResultSerializer(serializers.ModelSerializer): class PageSearchResultSerializer(serializers.ModelSerializer):
......
from arkindex.project.tests import FixtureAPITestCase
from arkindex.documents.models import Corpus
from django.urls import reverse
from rest_framework import status
class TestSearchApi(FixtureAPITestCase):
    """
    Check how the act search endpoint reacts to client-provided query parameters
    """

    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        # A non-public corpus, used to check that searching in it is refused
        cls.private_corpus = Corpus.objects.create(name='private', public=False)

    def setUp(self):
        super().setUp()
        # Parameter sets expected to be accepted
        # NOTE(review): self.corpus is presumably provided by FixtureAPITestCase - confirm
        self.valid_params = (
            {'q': 'a', 'score': '.7'},
            {'q': 'one two', 'date_lte': '1333'},
            {'q': 'one two', 'date_lte': '1333-12'},
            {'q': 'one two', 'date_lte': '1333-12-02'},
            {'q': 'one', 'type': 'page'},
            {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'},
        )
        # Parameter sets expected to fail validation: blank query, score out of
        # bounds or not a number, malformed or nonexistent date, unknown type, bad UUID
        self.wrong_params = (
            {'q': ' ', 'score': '0.7'},
            {'q': 'one', 'score': '1.01'},
            {'q': 'that', 'score': 'null'},
            {'q': 'one two', 'date_lte': '1450-'},
            {'q': 'one two', 'date_lte': '1450-02-30'},
            {'q': 'one', 'type': 'wrongtype'},
            {'q': 'cat', 'corpus': 'not_even_an_uuid'}
        )
        # Parameter sets expected to be refused for lack of access rights
        self.forbidden_params = (
            {'q': 'knowledge', 'corpus': self.private_corpus.id},
        )

    def test_search_api(self):
        """
        Each group of parameter sets must yield its expected HTTP status
        when sent to the act search endpoint
        """
        endpoint = reverse('api:act-search')
        expectations = (
            (self.valid_params, status.HTTP_200_OK),
            (self.wrong_params, status.HTTP_400_BAD_REQUEST),
            (self.forbidden_params, status.HTTP_403_FORBIDDEN),
        )
        for param_sets, expected_status in expectations:
            for params in param_sets:
                response = self.client.get(endpoint, params)
                self.assertEqual(response.status_code, expected_status)
from django.conf import settings from django.conf import settings
from django.shortcuts import get_object_or_404 from django.shortcuts import get_object_or_404
from rest_framework.exceptions import PermissionDenied, ValidationError, APIException from rest_framework.exceptions import PermissionDenied, APIException
from arkindex.documents.models import Corpus, Right from arkindex.documents.models import Corpus, Right
from arkindex.documents.serializers.search import SearchQuerySerializer
from arkindex.project.elastic import ESQuerySet from arkindex.project.elastic import ESQuerySet
from arkindex.project.tools import elasticsearch_escape
class CorpusACLMixin(object): class CorpusACLMixin(object):
...@@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin): ...@@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin):
es_sort = None es_sort = None
post_process_args = None post_process_args = None
def get(self, request, *args, **kwargs):
q = request.query_params.get('q')
if not q or q.isspace():
raise ValidationError('A search query is required')
return super().get(request, *args, **kwargs)
def get_context(self): def get_context(self):
context = { serializer = SearchQuerySerializer(context={'request': self.request}, data=self.request.query_params)
'query': elasticsearch_escape(self.request.query_params['q']), serializer.is_valid(raise_exception=True)
'type': self.request.query_params.get('type'), context = serializer.validated_data
'min_score': self.request.query_params.get('score'), context['inner_hits_size'] = settings.ES_INNER_RESULTS_LIMIT
'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT, # TODO Handle corpus field in serializer too
} if context['corpus_id']:
if 'corpus' in self.request.query_params:
try: try:
context['corpus_id'] = str(self.get_corpus(self.request.query_params['corpus']).id) context['corpus_id'] = str(self.get_corpus(context['corpus_id']).id)
except Corpus.DoesNotExist: except Corpus.DoesNotExist:
raise PermissionDenied raise PermissionDenied
else: else:
......
...@@ -2,6 +2,11 @@ from django.conf import settings ...@@ -2,6 +2,11 @@ from django.conf import settings
from rest_framework import serializers from rest_framework import serializers
from enum import Enum from enum import Enum
from arkindex_common.ml_tool import MLTool, MLToolType from arkindex_common.ml_tool import MLTool, MLToolType
from arkindex.project.tools import elasticsearch_escape
import calendar
import re
DATE_REGEX = re.compile(r'^(?P<year>[1-2]\d{3})(-(?P<month>0[1-9]|1[0-2])(-(?P<day>\d{2}))?)?$')
class EnumField(serializers.ChoiceField): class EnumField(serializers.ChoiceField):
...@@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField): ...@@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField):
def to_internal_value(self, data): def to_internal_value(self, data):
assert self.enum is not None, "No enum set on EnumField" assert self.enum is not None, "No enum set on EnumField"
return self.enum(data) try:
return self.enum(data)
except ValueError:
raise serializers.ValidationError('Value is not of type {}'.format(self.enum.__name__))
class MLToolField(serializers.CharField): class MLToolField(serializers.CharField):
...@@ -38,3 +46,57 @@ class MLToolField(serializers.CharField): ...@@ -38,3 +46,57 @@ class MLToolField(serializers.CharField):
return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data) return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data)
except ValueError as e: except ValueError as e:
raise serializers.ValidationError(str(e)) raise serializers.ValidationError(str(e))
class SearchTermsField(serializers.CharField):
    """
    Serialize ElasticSearch query terms
    """

    def to_internal_value(self, query_terms):
        # Hand the raw terms to the project's ElasticSearch escaping helper
        escaped_terms = elasticsearch_escape(query_terms)
        return escaped_terms
def date_validator(date):
    """
    Validate that a date string has a format ElasticSearch can understand

    The expected format is YYYY[-MM[-DD]]. DATE_REGEX validates the year
    (between 1000 and 2999) and the month (between 01 and 12) directly, whereas
    the day is only captured as a 2-digit number and is checked here against
    the actual number of days of the given month and year.

    The whole string has to match: `fullmatch` is used because `match` with a
    `$` anchor would also accept a value ending with a newline.

    Returns nothing. Raises serializers.ValidationError when the string does not
    follow the format or when the day does not exist in the given month.
    """
    match = DATE_REGEX.fullmatch(date)
    if not match:
        raise serializers.ValidationError('Could not parse date. format should be YYYY[-MM[-DD]]')
    date_dict = match.groupdict()
    # The day is optional; the regex only allows a day when a month is present,
    # so year and month can safely be converted alongside it
    if date_dict['day']:
        year, month, day = (int(date_dict[key]) for key in ('year', 'month', 'day'))
        # monthrange returns (weekday of first day, number of days in month)
        last_day = calendar.monthrange(year, month)[1]
        if not 1 <= day <= last_day:
            raise serializers.ValidationError('Could not parse date. Day is invalid')
def round_date(date):
    """
    Append an ElasticSearch date math suffix moving the date to the next unit,
    the unit being the finest one provided: year, month, or day otherwise
    https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math
    """
    # Coarser formats first; anything else falls back to a one-day step
    suffix_by_pattern = (
        (r'^\d{4}$', '||+1y'),
        (r'^\d{4}-\d{2}$', '||+1M'),
    )
    for pattern, suffix in suffix_by_pattern:
        if re.match(pattern, date):
            return date + suffix
    return date + '||+1d'
class SimpleDateField(serializers.CharField):
    """
    Serialize a date into a string understandable by ElasticSearch

    When `rounded` is set, the validated date gets a date math suffix
    appended by round_date().
    """

    def __init__(self, rounded=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.rounded = rounded

    def to_internal_value(self, date):
        # Raises a validation error on malformed dates, returns nothing otherwise
        date_validator(date)
        return round_date(date) if self.rounded else date
{ {
"bool": { "bool": {
"filter": [ "filter": [
{% if date_gte or date_lt %}
{
"range": {
"date_range": {
{% if date_lt %}
"lt": "{{ date_lt }}",
{% endif %}
{% if date_gte %}
"gte": "{{ date_gte }}",
{% endif %}
"relation": "intersects"
}
}
},
{% endif %}
{% if corpus_id %} {% if corpus_id %}
{ {
"match": { "match": {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment