# Serializers for corpus search (Solr-backed): result documents, facets,
# search queries and search-index reindexation requests.
from textwrap import dedent
from rest_framework import serializers
from rest_framework.exceptions import ValidationError
from arkindex.documents.models import MetaType
from arkindex.project.rq_overrides import get_existing_job
from arkindex.project.serializer_fields import EnumField
from arkindex.project.triggers import reindex_corpus
class SolrDocumentSerializer(serializers.Serializer):
    """
    Serializes a Solr document: one search hit combining an element with at
    most one related transcription, classification, metadata and entity.
    Everything except the element's own attributes may be null, since a hit
    can match on the element alone.
    """
    # Identifier of the Solr document itself
    id = serializers.UUIDField()
    # Direct parent element of the matched element, if any
    parent_id = serializers.UUIDField(allow_null=True)
    parent_name = serializers.CharField(allow_null=True)
    parent_type = serializers.CharField(allow_null=True)
    # The matched element: its ID, display text, type name,
    # creator worker name and image URL
    element_id = serializers.UUIDField()
    element_text = serializers.CharField()
    element_type = serializers.CharField()
    element_worker = serializers.CharField(allow_null=True)
    element_image = serializers.URLField(allow_null=True)
    # Optional transcription associated with the hit;
    # confidence is constrained to [0, 1]
    transcription_id = serializers.UUIDField(allow_null=True)
    transcription_text = serializers.CharField(allow_null=True)
    transcription_confidence = serializers.FloatField(min_value=0, max_value=1, allow_null=True)
    transcription_worker = serializers.CharField(allow_null=True)
    # Optional classification associated with the hit;
    # confidence is constrained to [0, 1]
    classification_id = serializers.UUIDField(allow_null=True)
    classification_name = serializers.CharField(allow_null=True)
    classification_confidence = serializers.FloatField(min_value=0, max_value=1, allow_null=True)
    classification_worker = serializers.CharField(allow_null=True)
    # Optional metadata associated with the hit; type is one of the
    # MetaType enum values
    metadata_id = serializers.UUIDField(allow_null=True)
    metadata_name = serializers.CharField(allow_null=True)
    metadata_text = serializers.CharField(allow_null=True)
    metadata_type = EnumField(MetaType, allow_null=True)
    metadata_worker = serializers.CharField(allow_null=True)
    # Optional entity associated with the hit
    entity_id = serializers.UUIDField(allow_null=True)
    entity_text = serializers.CharField(allow_null=True)
    entity_type = serializers.CharField(allow_null=True)
    entity_worker = serializers.CharField(allow_null=True)
class SolrFacetsSerializer(serializers.Serializer):
    """
    Serializes Solr facets: each attribute maps a facet value (e.g. a type
    or worker name) to its number of matching documents. Counts are always
    strictly positive, as Solr only returns facet values that have hits.
    """
    element_type = serializers.DictField(child=serializers.IntegerField(min_value=1))
    element_worker = serializers.DictField(child=serializers.IntegerField(min_value=1))
    transcription_worker = serializers.DictField(child=serializers.IntegerField(min_value=1))
    classification_name = serializers.DictField(child=serializers.IntegerField(min_value=1))
    classification_worker = serializers.DictField(child=serializers.IntegerField(min_value=1))
    metadata_name = serializers.DictField(child=serializers.IntegerField(min_value=1))
    metadata_type = serializers.DictField(child=serializers.IntegerField(min_value=1))
    metadata_worker = serializers.DictField(child=serializers.IntegerField(min_value=1))
    entity_type = serializers.DictField(child=serializers.IntegerField(min_value=1))
    entity_worker = serializers.DictField(child=serializers.IntegerField(min_value=1))
class CorpusSearchResultSerializer(serializers.Serializer):
    """
    Serializes one page of corpus search results, mimicking DRF's paginated
    response shape (count/next/previous/results) with facets added.
    """
    # Total number of matching documents across all pages
    count = serializers.IntegerField()
    # Current page number
    number = serializers.IntegerField()
    # URLs of the next/previous pages; null at the pagination boundaries
    next = serializers.URLField(allow_null=True)
    previous = serializers.URLField(allow_null=True)
    # Matching documents for this page; null when only facets were requested
    results = SolrDocumentSerializer(many=True, allow_null=True)
    facets = SolrFacetsSerializer()
class CorpusSearchQuerySerializer(serializers.Serializer):
    """
    Search parameters validation serializer in order to build a Solr query.

    `query` is the only required field; everything else either has a default
    (sources, only_facets, page, sort) or is an optional facet filter.
    """
    # Full-text search query; the help text documents the supported
    # mini query language that gets translated into Solr syntax.
    query = serializers.CharField(help_text=dedent('''
        Search query on transcription text, metadata value or element and entity name.
        * By default, will search for the exact word (`a` will not match `zab`)
        * Prefix and suffix the word with `*` to search it in a text (`*a*` will match `zab`)
        * By default, will look for only one of each word (`a b` is translated to `a or b`).
        * Combine words with `and` to require multiple words.
        * Prefix a word with `-` or `!` to filter out items that contain a word.
        * Add `*` around multiple words to require them all to be present
        * in a specific order (`*a*b*`).
        * Prefix parentheses with a `-` or `!` to require multiple words to be missing (`-(a b)`).
        * Add quotes around multiple words to search a specific sentence (`"a b"`).
        '''))
    # Which document sources to search; defaults to all of them.
    sources = serializers.MultipleChoiceField(
        [
            ('element', 'element'),
            ('transcription', 'transcription'),
            ('metadata', 'metadata'),
            ('entity', 'entity')
        ],
        default={'element', 'transcription', 'metadata', 'entity'},
        help_text='List of sources to be searched on.',
    )
    # When True, skip fetching documents and return facet counts only.
    only_facets = serializers.BooleanField(
        default=False,
        help_text='Returns only the facets. `results` will be set to `null`.',
    )
    # 1-based page number
    page = serializers.IntegerField(default=1, min_value=1)
    sort = serializers.ChoiceField(
        choices=['relevance', 'element_name'],
        default='relevance',
        help_text='Criterion on which to sort search results',
    )
    # Facets
    # Optional filters matching the facet names exposed by SolrFacetsSerializer;
    # each restricts results to documents with the given facet value.
    element_type = serializers.CharField(
        required=False,
        help_text='Filter by element type name.',
    )
    element_worker = serializers.CharField(
        required=False,
        help_text='Filter by name of the worker that created the element.',
    )
    transcription_worker = serializers.CharField(
        required=False,
        help_text='Filter by name of the worker that created the transcription.',
    )
    classification_name = serializers.CharField(
        required=False,
        help_text='Filter by class name.'
    )
    classification_worker = serializers.CharField(
        required=False,
        help_text='Filter by name of the worker that created the classification.',
    )
    metadata_name = serializers.CharField(
        required=False,
        help_text='Filter by metadata name.',
    )
    metadata_type = EnumField(
        MetaType,
        required=False,
        help_text='Filter by metadata type.',
    )
    metadata_worker = serializers.CharField(
        required=False,
        help_text='Filter by name of the worker that created the metadata.',
    )
    entity_type = serializers.CharField(
        required=False,
        help_text='Filter by entity type.',
    )
    entity_worker = serializers.CharField(
        required=False,
        help_text='Filter by name of the worker that created the entity.',
    )
class ReindexCorpusSerializer(serializers.Serializer):
    """
    Validates a search reindexation request and enqueues the asynchronous job.
    """
    drop = serializers.BooleanField(default=True, help_text='Drop existing collections for this corpus.')

    def save(self, **kwargs):
        """
        Enqueue a reindexation job for the corpus found in the serializer context.

        Raises a ValidationError when a reindex job for this corpus is still running.
        """
        corpus_id = self.context.get('corpus_id')
        user_id = self.context.get('user_id')
        # Both IDs must be injected by the calling view; their absence is a
        # programming error, not a user input error.
        assert corpus_id and user_id, 'corpus_id and user_id must be passed in the serializer context'
        # Deterministic, per-corpus job ID so duplicate requests can be detected
        job_id = f'reindex-{corpus_id}'
        previous_job = get_existing_job(job_id)
        if previous_job is not None:
            if previous_job.ended_at is None:
                # The previous job has not finished: refuse a concurrent reindex
                raise ValidationError({
                    '__all__': [f'A job is already running to build search index on corpus {corpus_id}.']
                })
            # Finished job: remove it so its ID can be reused for the new run
            previous_job.delete()
        reindex_corpus(**self.validated_data, corpus_id=corpus_id, user_id=user_id, job_id=job_id)