Skip to content
Snippets Groups Projects
Commit e089f259 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

Merge branch 'dates-in-search-api' into 'master'

Dates in search api

See merge request !267
parents 26286305 5e6f4fe4
No related branches found
No related tags found
1 merge request: !267 "Dates in search api"
......@@ -30,6 +30,7 @@ class PageSearch(SearchAPIView):
class ActSearch(SearchAPIView):
"""
Search for acts containing a specific word
within a specific period
"""
serializer_class = ActSearchResultSerializer
template_path = 'elastic/search_nested.json'
......
from rest_framework import serializers
from arkindex.documents.models import Act, Page
from arkindex.documents.models import Act, Page, TranscriptionType
from arkindex.documents.serializers.light import CorpusLightSerializer
from arkindex.documents.serializers.elements import ElementLightSerializer
from arkindex.documents.serializers.ml import TranscriptionSerializer
from arkindex.project.serializer_fields import EnumField, SimpleDateField, SearchTermsField
class SearchQuerySerializer(serializers.Serializer):
    """
    Validate the search parameters used to build an ES query.

    date_lte is rounded up using ES date math syntax (see round_date()) and is
    exposed as date_lt (strictly lower than).
    The rounding unit depends on whether a month and/or a day are provided
    (e.g. '<= march 1999' becomes '< april 1999').
    """
    # Search terms, exposed as `query` in the validated data
    q = SearchTermsField(source='query')
    # Minimum score between 0.0 and 1.0, exposed as `min_score`
    score = serializers.FloatField(source='min_score', min_value=0.0, max_value=1.0, default=0.0)
    # Lower date bound (inclusive), kept under its own name
    date_gte = SimpleDateField(default='')
    # Upper date bound: rounded up, then exposed as the exclusive bound `date_lt`
    date_lte = SimpleDateField(source='date_lt', rounded=True, default='')
    # Optional restriction to a TranscriptionType member
    type = EnumField(enum=TranscriptionType, default='')
    # Optional corpus UUID, exposed as `corpus_id`
    corpus = serializers.UUIDField(source='corpus_id', default='')
class PageSearchResultSerializer(serializers.ModelSerializer):
......
from arkindex.project.tests import FixtureAPITestCase
from arkindex.documents.models import Corpus
from django.urls import reverse
from rest_framework import status
class TestSearchApi(FixtureAPITestCase):
    """
    Check the status codes returned by the act search endpoint for valid,
    invalid and forbidden sets of query parameters.
    """

    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        cls.private_corpus = Corpus.objects.create(name='private', public=False)

    def setUp(self):
        super().setUp()
        # Parameter sets the endpoint is expected to accept
        self.valid_params = (
            {'q': 'a', 'score': '.7'},
            {'q': 'one two', 'date_lte': '1333'},
            {'q': 'one two', 'date_lte': '1333-12'},
            {'q': 'one two', 'date_lte': '1333-12-02'},
            {'q': 'one', 'type': 'page'},
            {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'},
        )
        # Parameter sets the endpoint is expected to reject as malformed
        self.wrong_params = (
            {'q': ' ', 'score': '0.7'},
            {'q': 'one', 'score': '1.01'},
            {'q': 'that', 'score': 'null'},
            {'q': 'one two', 'date_lte': '1450-'},
            {'q': 'one two', 'date_lte': '1450-02-30'},
            {'q': 'one', 'type': 'wrongtype'},
            {'q': 'cat', 'corpus': 'not_even_an_uuid'},
        )
        # Parameter sets targeting the non-public corpus created in setUpTestData
        self.forbidden_params = (
            {'q': 'knowledge', 'corpus': self.private_corpus.id},
        )

    def _assert_search_status(self, params_sets, expected_status):
        """
        GET the act search endpoint once per parameter set and check the status code.

        Each set runs in its own subTest so that a failure reports the
        offending parameters and does not hide the remaining sets.
        """
        url = reverse('api:act-search')
        for params in params_sets:
            with self.subTest(params=params):
                response = self.client.get(url, params)
                self.assertEqual(response.status_code, expected_status)

    def test_search_api(self):
        """
        Check if different set of client-provided parameters are
        correctly handled by search api endpoint
        """
        self._assert_search_status(self.valid_params, status.HTTP_200_OK)
        self._assert_search_status(self.wrong_params, status.HTTP_400_BAD_REQUEST)
        self._assert_search_status(self.forbidden_params, status.HTTP_403_FORBIDDEN)
from django.conf import settings
from django.shortcuts import get_object_or_404
from rest_framework.exceptions import PermissionDenied, ValidationError, APIException
from rest_framework.exceptions import PermissionDenied, APIException
from arkindex.documents.models import Corpus, Right
from arkindex.documents.serializers.search import SearchQuerySerializer
from arkindex.project.elastic import ESQuerySet
from arkindex.project.tools import elasticsearch_escape
class CorpusACLMixin(object):
......@@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin):
es_sort = None
post_process_args = None
def get(self, request, *args, **kwargs):
    """
    Handle GET requests, refusing those without a usable `q` parameter.

    Raises a ValidationError when `q` is missing, empty or only whitespace;
    otherwise delegates to the parent class.
    """
    search_terms = request.query_params.get('q')
    if search_terms and not search_terms.isspace():
        return super().get(request, *args, **kwargs)
    raise ValidationError('A search query is required')
def get_context(self):
context = {
'query': elasticsearch_escape(self.request.query_params['q']),
'type': self.request.query_params.get('type'),
'min_score': self.request.query_params.get('score'),
'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT,
}
if 'corpus' in self.request.query_params:
serializer = SearchQuerySerializer(context={'request': self.request}, data=self.request.query_params)
serializer.is_valid(raise_exception=True)
context = serializer.validated_data
context['inner_hits_size'] = settings.ES_INNER_RESULTS_LIMIT
# TODO Handle corpus field in serializer too
if context['corpus_id']:
try:
context['corpus_id'] = str(self.get_corpus(self.request.query_params['corpus']).id)
context['corpus_id'] = str(self.get_corpus(context['corpus_id']).id)
except Corpus.DoesNotExist:
raise PermissionDenied
else:
......
......@@ -2,6 +2,11 @@ from django.conf import settings
from rest_framework import serializers
from enum import Enum
from arkindex_common.ml_tool import MLTool, MLToolType
from arkindex.project.tools import elasticsearch_escape
import calendar
import re
DATE_REGEX = re.compile(r'^(?P<year>[1-2]\d{3})(-(?P<month>0[1-9]|1[0-2])(-(?P<day>\d{2}))?)?$')
class EnumField(serializers.ChoiceField):
......@@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField):
def to_internal_value(self, data):
    """
    Convert the incoming value into a member of the configured enum.

    Raises a ValidationError when the enum rejects the value, so that the
    ValueError does not propagate out of the field.
    """
    assert self.enum is not None, "No enum set on EnumField"
    # The lookup must live inside the try block: an unconditional return
    # placed before it would make the error conversion below unreachable.
    try:
        return self.enum(data)
    except ValueError:
        raise serializers.ValidationError('Value is not of type {}'.format(self.enum.__name__))
class MLToolField(serializers.CharField):
......@@ -38,3 +46,57 @@ class MLToolField(serializers.CharField):
return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data)
except ValueError as e:
raise serializers.ValidationError(str(e))
class SearchTermsField(serializers.CharField):
    """
    Character field holding ElasticSearch query terms
    """

    def to_internal_value(self, query_terms):
        """
        Return the incoming terms after passing them through elasticsearch_escape.
        """
        escaped_terms = elasticsearch_escape(query_terms)
        return escaped_terms
def date_validator(date):
    """
    Validate that a date string has the YYYY[-MM[-DD]] format expected for ElasticSearch.

    DATE_REGEX matches a mandatory year, then an optional 2-digit month and day.
    Year (between 1000 and 2999) and month (between 01 and 12) are validated by the
    regex itself, whereas the day, if found, is captured as any 2-digit number and
    checked afterwards against the length of that month.

    Returns nothing; raises a ValidationError when the date cannot be parsed or
    when the day does not exist in the given year and month.
    """
    # fullmatch instead of match: `$` alone also matches just before a trailing
    # newline, which would let a value such as '1333\n' through.
    match = DATE_REGEX.fullmatch(date)
    if match is None:
        raise serializers.ValidationError('Could not parse date. format should be YYYY[-MM[-DD]]')

    # Without a day there is nothing left to check
    if match.group('day') is None:
        return

    # The regex only captures a day after a month, so all three groups are set here
    year, month, day = (int(match.group(name)) for name in ('year', 'month', 'day'))
    days_in_month = calendar.monthrange(year, month)[1]
    if not 1 <= day <= days_in_month:
        raise serializers.ValidationError('Could not parse date. Day is invalid')
def round_date(date):
    """
    Append the ElasticSearch date math suffix that rounds a date up by one unit.

    The unit is picked from the precision of the date: one month for YYYY-MM,
    one year for YYYY, and one day for anything else.
    https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math
    """
    # Checked in order; the first matching pattern decides the suffix
    suffix_by_pattern = (
        (r'^\d{4}-\d{2}$', '||+1M'),
        (r'^\d{4}$', '||+1y'),
    )
    for pattern, suffix in suffix_by_pattern:
        if re.match(pattern, date):
            return date + suffix
    # Neither a bare year nor a year-month: round by one day
    return date + '||+1d'
class SimpleDateField(serializers.CharField):
    """
    Character field for a date that ElasticSearch can understand
    """

    def __init__(self, rounded=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # When true, validated values are passed through round_date
        self.rounded = rounded

    def to_internal_value(self, date):
        """
        Check the date with date_validator, then return it, rounded if requested.
        """
        date_validator(date)
        return round_date(date) if self.rounded else date
{
"bool": {
"filter": [
{% if date_gte or date_lt %}
{
"range": {
"date_range": {
{% if date_lt %}
"lt": "{{ date_lt }}",
{% endif %}
{% if date_gte %}
"gte": "{{ date_gte }}",
{% endif %}
"relation": "intersects"
}
}
},
{% endif %}
{% if corpus_id %}
{
"match": {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment