Skip to content
Snippets Groups Projects
Commit 5e6f4fe4 authored by Valentin Rigal, committed by Erwan Rouchet
Browse files

Dates in search api

parent 26286305
No related branches found
No related tags found
No related merge requests found
......@@ -30,6 +30,7 @@ class PageSearch(SearchAPIView):
class ActSearch(SearchAPIView):
"""
Search for acts containing a specific word
within a specific period
"""
serializer_class = ActSearchResultSerializer
template_path = 'elastic/search_nested.json'
......
from rest_framework import serializers
from arkindex.documents.models import Act, Page
from arkindex.documents.models import Act, Page, TranscriptionType
from arkindex.documents.serializers.light import CorpusLightSerializer
from arkindex.documents.serializers.elements import ElementLightSerializer
from arkindex.documents.serializers.ml import TranscriptionSerializer
from arkindex.project.serializer_fields import EnumField, SimpleDateField, SearchTermsField
class SearchQuerySerializer(serializers.Serializer):
    """
    Validate the search parameters sent by a client so they can be used to build an ES query

    date_lte is exposed internally as date_lt (strictly lower than): its value is rounded
    up with the ES date math syntax (see round_date()), and the rounding unit depends on
    whether a month/day was provided (Ex: '<= march 1999' becomes '< april 1999')
    """
    # Search terms, stored under the 'query' key
    q = SearchTermsField(source='query')
    # Minimum score, bounded to [0.0, 1.0] and stored under the 'min_score' key
    score = serializers.FloatField(
        source='min_score',
        default=0.0,
        min_value=0.0,
        max_value=1.0,
    )
    # Date bounds; only the upper one is rounded
    date_gte = SimpleDateField(default='')
    date_lte = SimpleDateField(
        source='date_lt',
        default='',
        rounded=True,
    )
    type = EnumField(default='', enum=TranscriptionType)
    corpus = serializers.UUIDField(default='', source='corpus_id')
class PageSearchResultSerializer(serializers.ModelSerializer):
......
from arkindex.project.tests import FixtureAPITestCase
from arkindex.documents.models import Corpus
from django.urls import reverse
from rest_framework import status
class TestSearchApi(FixtureAPITestCase):
    """
    Check how the act search endpoint reacts to client-provided query parameters
    """

    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        # Non-public corpus used to trigger the forbidden case
        cls.private_corpus = Corpus.objects.create(name='private', public=False)

    def setUp(self):
        super().setUp()
        # Parameter sets expected to answer HTTP 200
        # NOTE(review): self.corpus is presumably provided by FixtureAPITestCase - confirm
        self.valid_params = (
            {'q': 'a', 'score': '.7'},
            {'q': 'one two', 'date_lte': '1333'},
            {'q': 'one two', 'date_lte': '1333-12'},
            {'q': 'one two', 'date_lte': '1333-12-02'},
            {'q': 'one', 'type': 'page'},
            {'q': 'cat', 'corpus': str(self.corpus.id), 'score': '0.9', 'date_lte': '1333-12-02'},
        )
        # Parameter sets expected to answer HTTP 400
        self.wrong_params = (
            {'q': ' ', 'score': '0.7'},
            {'q': 'one', 'score': '1.01'},
            {'q': 'that', 'score': 'null'},
            {'q': 'one two', 'date_lte': '1450-'},
            {'q': 'one two', 'date_lte': '1450-02-30'},
            {'q': 'one', 'type': 'wrongtype'},
            {'q': 'cat', 'corpus': 'not_even_an_uuid'},
        )
        # Parameter sets expected to answer HTTP 403
        self.forbidden_params = (
            {'q': 'knowledge', 'corpus': self.private_corpus.id},
        )

    def _assert_search_status(self, params_sets, expected_status):
        """
        Query the act search endpoint with each parameter set and check the response status

        Each request runs in its own subTest so that a failure reports the offending
        parameters and does not prevent the remaining sets from being checked
        """
        url = reverse('api:act-search')
        for params in params_sets:
            with self.subTest(params=params):
                response = self.client.get(url, params)
                self.assertEqual(response.status_code, expected_status)

    def test_search_api(self):
        """
        Check if different set of client-provided parameters are
        correctly handled by search api endpoint
        """
        self._assert_search_status(self.valid_params, status.HTTP_200_OK)
        self._assert_search_status(self.wrong_params, status.HTTP_400_BAD_REQUEST)
        self._assert_search_status(self.forbidden_params, status.HTTP_403_FORBIDDEN)
from django.conf import settings
from django.shortcuts import get_object_or_404
from rest_framework.exceptions import PermissionDenied, ValidationError, APIException
from rest_framework.exceptions import PermissionDenied, APIException
from arkindex.documents.models import Corpus, Right
from arkindex.documents.serializers.search import SearchQuerySerializer
from arkindex.project.elastic import ESQuerySet
from arkindex.project.tools import elasticsearch_escape
class CorpusACLMixin(object):
......@@ -36,22 +36,15 @@ class SearchAPIMixin(CorpusACLMixin):
es_sort = None
post_process_args = None
def get(self, request, *args, **kwargs):
    """
    Handle a GET request, refusing it when the 'q' query parameter is missing, empty or blank

    Raises ValidationError when no usable search query is given
    """
    search_terms = request.query_params.get('q')
    # Only delegate to the parent handler when there is something to search for
    if search_terms and not search_terms.isspace():
        return super().get(request, *args, **kwargs)
    raise ValidationError('A search query is required')
def get_context(self):
context = {
'query': elasticsearch_escape(self.request.query_params['q']),
'type': self.request.query_params.get('type'),
'min_score': self.request.query_params.get('score'),
'inner_hits_size': settings.ES_INNER_RESULTS_LIMIT,
}
if 'corpus' in self.request.query_params:
serializer = SearchQuerySerializer(context={'request': self.request}, data=self.request.query_params)
serializer.is_valid(raise_exception=True)
context = serializer.validated_data
context['inner_hits_size'] = settings.ES_INNER_RESULTS_LIMIT
# TODO Handle corpus field in serializer too
if context['corpus_id']:
try:
context['corpus_id'] = str(self.get_corpus(self.request.query_params['corpus']).id)
context['corpus_id'] = str(self.get_corpus(context['corpus_id']).id)
except Corpus.DoesNotExist:
raise PermissionDenied
else:
......
......@@ -2,6 +2,11 @@ from django.conf import settings
from rest_framework import serializers
from enum import Enum
from arkindex_common.ml_tool import MLTool, MLToolType
from arkindex.project.tools import elasticsearch_escape
import calendar
import re
DATE_REGEX = re.compile(r'^(?P<year>[1-2]\d{3})(-(?P<month>0[1-9]|1[0-2])(-(?P<day>\d{2}))?)?$')
class EnumField(serializers.ChoiceField):
......@@ -20,7 +25,10 @@ class EnumField(serializers.ChoiceField):
def to_internal_value(self, data):
    """
    Convert a raw value into a member of the enum configured on this field

    Raises serializers.ValidationError when the value does not match any member
    """
    assert self.enum is not None, "No enum set on EnumField"
    # The enum lookup raises ValueError for an unknown value, so it has to be the
    # guarded call: report the failure as a validation error instead of letting it escape
    try:
        return self.enum(data)
    except ValueError:
        raise serializers.ValidationError('Value is not of type {}'.format(self.enum.__name__))
class MLToolField(serializers.CharField):
......@@ -38,3 +46,57 @@ class MLToolField(serializers.CharField):
return MLTool.get(settings.ML_CLASSIFIERS_DIR, self.tool_type, data)
except ValueError as e:
raise serializers.ValidationError(str(e))
class SearchTermsField(serializers.CharField):
    """
    Serialize ElasticSearch query terms
    """

    def to_internal_value(self, query_terms):
        # The raw terms are passed through elasticsearch_escape before being used
        escaped_terms = elasticsearch_escape(query_terms)
        return escaped_terms
def date_validator(date):
    """
    Validate that date format is understandable for ElasticSearch

    The expected format is YYYY[-MM[-DD]]. DATE_REGEX validates the year (between 1000
    and 2999) and the month (between 1 and 12); the day, if found, is captured as a
    2-digit number and checked here against the number of days of that month.

    The whole string has to match: a plain match() would accept a trailing line break,
    because '$' also matches just before a final newline.

    Raises serializers.ValidationError when the date cannot be parsed or the day is invalid
    """
    match = DATE_REGEX.fullmatch(date)
    if match is None:
        raise serializers.ValidationError('Could not parse date. format should be YYYY[-MM[-DD]]')
    # Nothing more to check unless a day is present
    if not match.group('day'):
        return
    # A day can only be captured together with a year and a month
    year, month, day = (int(match.group(part)) for part in ('year', 'month', 'day'))
    _, days_in_month = calendar.monthrange(year, month)
    if not 1 <= day <= days_in_month:
        raise serializers.ValidationError('Could not parse date. Day is invalid')
def round_date(date):
    """
    Append the ElasticSearch date math suffix rounding a date to the superior value

    The unit added depends on the precision of the date: one year for YYYY,
    one month for YYYY-MM, one day for anything else.
    https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math
    """
    if re.match(r'^\d{4}$', date):
        suffix = '||+1y'
    elif re.match(r'^\d{4}-\d{2}$', date):
        suffix = '||+1M'
    else:
        suffix = '||+1d'
    return '{}{}'.format(date, suffix)
class SimpleDateField(serializers.CharField):
    """
    Char field validating a YYYY[-MM[-DD]] date and returning it in a form usable by ElasticSearch

    When rounded is set, the validated date is passed through round_date()
    """

    def __init__(self, rounded=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.rounded = rounded

    def to_internal_value(self, date):
        # Raises a validation error on a malformed date
        date_validator(date)
        return round_date(date) if self.rounded else date
{
"bool": {
"filter": [
{% if date_gte or date_lt %}
{
"range": {
"date_range": {
{% if date_lt %}
"lt": "{{ date_lt }}",
{% endif %}
{% if date_gte %}
"gte": "{{ date_gte }}",
{% endif %}
"relation": "intersects"
}
}
},
{% endif %}
{% if corpus_id %}
{
"match": {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment