diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8be0870007781a8d06662d42e550797487adc1e9..97410fcd4f0eaba2f1cb0ab32d5671270a65e411 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -65,7 +65,7 @@ backend-tests: - codecov backend-lint: - image: python:3 + image: python:3.7 stage: test except: diff --git a/.isort.cfg b/.isort.cfg index c5adfd361a051c85f23fc405e516926f4c37232b..d532ef433bbd25b92e4b31878863e863b7fb4b55 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -8,4 +8,4 @@ line_length = 120 default_section=FIRSTPARTY known_first_party = ponos,transkribus -known_third_party = SolrClient,bleach,boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,elasticsearch,elasticsearch_dsl,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml +known_third_party = SolrClient,bleach,boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml diff --git a/README.md b/README.md index 1a74dd2afb7d9d83e001d49c0645f64ec7619a83..956b6524063f1d9f8853f53c4205ba4398a1ed4b 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ Aside from the usual Django commands, some custom commands are available via `ma * `import_annotations`: Import index files from a folder into a specific volume; * `import_acts`: Import XML surface files and CSV act files; * `delete_corpus`: Delete a big corpus using a Ponos task; -* `reindex`: Run asynchronous tasks on the Celery worker to reindex transcriptions in ElasticSearch; +* `reindex`: Reindex elements into Solr; * `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics. See `manage.py <command> --help` to view more details about a specific command. @@ -177,6 +177,11 @@ You may want to also uninstall `django-nose`, as it is an optional test runner t We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist: * Delete a corpus: `corpus_delete` -* Reindex elements, transcriptions or entities into ElasticSearch: `reindex_start` +* Delete a list of elements: `element_trash` +* Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete` +* Move an element to another parent: `move_element` +* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity` +* Delete a process and its worker activities: `process_delete` +* Export a corpus to an SQLite database: `export_corpus` To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build. 
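As a quick illustration of the trigger layer the README paragraph above describes, a call from application code might look like the sketch below. This is only a hedged example: the exact function names and signatures exposed by `arkindex.project.triggers` are assumptions and should be checked against the module itself; only the task name `corpus_delete` comes from the list above.

```python
# Hypothetical sketch -- the trigger name and signature are assumed, not confirmed by this diff.
from arkindex.documents.models import Corpus
from arkindex.project import triggers

corpus = Corpus.objects.get(name='My corpus')

# The trigger performs its safety checks synchronously, then enqueues the
# `corpus_delete` task so the deletion runs on the RQ worker instead of
# blocking the API request cycle.
triggers.corpus_delete(corpus)
```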
diff --git a/arkindex/dataimport/tests/test_utils.py b/arkindex/dataimport/tests/test_utils.py index f74301cb867d9304986966957710282b8a7e2de4..aadb9e8706c7cd09d3f4733f2b97242a7965296d 100644 --- a/arkindex/dataimport/tests/test_utils.py +++ b/arkindex/dataimport/tests/test_utils.py @@ -16,7 +16,6 @@ class TestDataImportUtils(TestCase): @classmethod def setUpTestData(cls): super().setUpTestData() - cls.wheat_farm = Farm.objects.create(name='Wheat farm') cls.corn_farm = Farm.objects.create(id=DEFAULT_FARM_ID, name='Corn farm') cls.barley_farm = Farm.objects.create(name='Barley farm') diff --git a/arkindex/dataimport/tests/test_workeractivity_stats.py b/arkindex/dataimport/tests/test_workeractivity_stats.py index 6e3d8b226633b0ba1e9c2127b98c8dff2b68bc10..3a45d11c6a2b89bd349c965706e6f5ba6add6829 100644 --- a/arkindex/dataimport/tests/test_workeractivity_stats.py +++ b/arkindex/dataimport/tests/test_workeractivity_stats.py @@ -18,7 +18,6 @@ class TestWorkersActivity(FixtureAPITestCase): cls.version_1 = WorkerVersion.objects.get(worker__slug='reco') cls.version_2 = WorkerVersion.objects.get(worker__slug='dla') cls.private_corpus = Corpus.objects.create(name='private', public=False) - cls.elts_count = cls.corpus.elements.count() cls.process = DataImport.objects.create( mode=DataImportMode.Workers, creator=cls.user, diff --git a/arkindex/dataimport/tests/test_workflows_api.py b/arkindex/dataimport/tests/test_workflows_api.py index dbe215306c8e21ed766da0db31beb38c38a92024..738514f85bc5366107a2cc175fe436d6a2d9419f 100644 --- a/arkindex/dataimport/tests/test_workflows_api.py +++ b/arkindex/dataimport/tests/test_workflows_api.py @@ -39,7 +39,7 @@ class TestWorkflows(FixtureAPITestCase): cls.volume = Element.objects.get(name='Volume 1') cls.pages = Element.objects.get_descending(cls.volume.id).filter(type__slug='page', polygon__isnull=False) cls.private_corpus = Corpus.objects.create(name='private') - cls.private_volume = cls.private_corpus.elements.create( + cls.private_corpus.elements.create( type=cls.private_corpus.types.create(slug='volume', folder=True), name='Hidden', ) diff --git a/arkindex/documents/api/entities.py b/arkindex/documents/api/entities.py index 2fe58dfbcbbef460f78bc6334e72c9a79ef7f63f..b800a7772ace73bba6296ec2754180e2e222a04b 100644 --- a/arkindex/documents/api/entities.py +++ b/arkindex/documents/api/entities.py @@ -1,11 +1,9 @@ import logging from uuid import UUID -from django.conf import settings from django.core.exceptions import ValidationError from django.db.models import Q from drf_spectacular.utils import OpenApiExample, OpenApiParameter, extend_schema, extend_schema_view -from elasticsearch.exceptions import NotFoundError from rest_framework import permissions, serializers, status from rest_framework.exceptions import NotFound, PermissionDenied from rest_framework.generics import ( @@ -39,7 +37,6 @@ from arkindex.documents.serializers.entities import ( TranscriptionEntityDetailsSerializer, TranscriptionEntitySerializer, ) -from arkindex.project.elastic import ESEntity from arkindex.project.mixins import ACLMixin, CorpusACLMixin from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly from arkindex.users.models import Role @@ -126,19 +123,6 @@ class EntityDetails(ACLMixin, RetrieveUpdateDestroyAPIView): if self.request.method not in permissions.SAFE_METHODS and not self.has_access(obj.corpus, Role.Contributor.value): raise PermissionDenied(detail='You do not have write access to this corpus.') - def perform_destroy(self, instance): - if 
settings.ARKINDEX_FEATURES['search']: - # Try to delete indexed entity if possible - try: - es_entity = ESEntity.get(id=instance.id.hex) - es_entity.delete() - except NotFoundError: - pass - except Exception as e: - logger.error(f"Failed to delete ES index entity {instance.id}: {e}") - - instance.delete() - @extend_schema_view(get=extend_schema(operation_id='ListEntityElements', tags=['entities'])) class EntityElements(ListAPIView): diff --git a/arkindex/documents/api/iiif.py b/arkindex/documents/api/iiif.py index 7d6ad36fd01ce8ccbdd5b1dccd1050ad15c3b2f2..7a67a3a508a03622c05b26b8b66b6c8b584afd27 100644 --- a/arkindex/documents/api/iiif.py +++ b/arkindex/documents/api/iiif.py @@ -1,19 +1,10 @@ from django.utils.decorators import method_decorator from django.views.decorators.cache import cache_page -from drf_spectacular.utils import extend_schema, extend_schema_view -from rest_framework.exceptions import PermissionDenied +from drf_spectacular.utils import extend_schema from rest_framework.generics import RetrieveAPIView from arkindex.documents.models import Corpus, Element -from arkindex.documents.search import search_transcriptions_filter_post -from arkindex.documents.serializers.iiif import ( - ElementAnnotationListSerializer, - FolderManifestSerializer, - TranscriptionSearchAnnotationListSerializer, -) -from arkindex.documents.serializers.search import IIIFSearchQuerySerializer -from arkindex.project.elastic import ESTranscription -from arkindex.project.mixins import SearchAPIMixin +from arkindex.documents.serializers.iiif import ElementAnnotationListSerializer, FolderManifestSerializer class FolderManifest(RetrieveAPIView): @@ -60,53 +51,3 @@ class ElementAnnotationList(RetrieveAPIView): ) def get(self, *args, **kwargs): return super().get(*args, **kwargs) - - -@extend_schema_view( - get=extend_schema( - operation_id='SearchTranscriptionsAnnotationList', - responses={200: {'type': 'object'}}, - tags=['iiif'], - ) -) -class TranscriptionSearchAnnotationList(SearchAPIMixin, RetrieveAPIView): - """ - Retrieve an IIIF Search API annotation list for transcriptions on a folder element - """ - - serializer_class = TranscriptionSearchAnnotationListSerializer - query_serializer_class = IIIFSearchQuerySerializer - # For OpenAPI type discovery: an element's ID is in the path - queryset = Element.objects.none() - elt = None - - def get_element(self): - if not self.elt: - self.elt = Element.objects.get(id=self.kwargs['pk']) - if self.elt.corpus not in Corpus.objects.readable(self.request.user): - raise PermissionDenied - return self.elt - - def get_object(self): - return self.get_queryset() - - def get_search(self, query=None, min_confidence=0.0, **kwargs): - return ESTranscription.search() \ - .filter('match', corpus=str(self.get_element().corpus_id)) \ - .filter( - 'terms', - element=list( - Element.objects - .filter( - paths__path__overlap=[self.get_element().id], - paths__path__last=self.get_element().id, - type__folder=False, - image_id__isnull=False, - polygon__isnull=False, - ).values_list('id', flat=True) - )) \ - .filter('range', score={'gte': min_confidence}) \ - .query('match', text=query) - - def post_process(self, *args): - return search_transcriptions_filter_post(*args, element_id=self.get_element().id) diff --git a/arkindex/documents/api/search.py b/arkindex/documents/api/search.py index f3bed00c03b97e63363794f37060b2fd684d74be..de95bd753848bc57935dd2658b9c8ec7bd1fcb33 100644 --- a/arkindex/documents/api/search.py +++ b/arkindex/documents/api/search.py @@ -3,126 +3,20 @@ from textwrap 
import dedent from django.conf import settings from drf_spectacular.utils import OpenApiParameter, extend_schema, extend_schema_view -from elasticsearch_dsl.function import FieldValueFactor -from elasticsearch_dsl.query import FunctionScore, Nested, Q from rest_framework import status from rest_framework.exceptions import NotFound, ValidationError -from rest_framework.generics import ListAPIView from rest_framework.response import Response from rest_framework.utils.urls import replace_query_param from rest_framework.views import APIView from SolrClient import SolrClient from SolrClient.exceptions import SolrError -from arkindex.documents.search import search_elements_post, search_entities_post -from arkindex.documents.serializers.search import ( - CorpusSearchQuerySerializer, - CorpusSearchResultSerializer, - ElementSearchResultSerializer, - EntitySearchQuerySerializer, - EntitySearchResultSerializer, - SearchQuerySerializer, -) -from arkindex.project.elastic import ESElement, ESEntity -from arkindex.project.mixins import CorpusACLMixin, SearchAPIMixin +from arkindex.documents.serializers.search import CorpusSearchQuerySerializer, CorpusSearchResultSerializer +from arkindex.project.mixins import CorpusACLMixin solr = SolrClient(settings.SOLR_API_URL) -@extend_schema_view( - get=extend_schema( - operation_id='SearchElements', - tags=['search'], - parameters=[SearchQuerySerializer], - ) -) -class ElementSearch(SearchAPIMixin, ListAPIView): - """ - Get a list of elements with their parents, the total number of transcriptions - in each element, and a few (not all) of their transcriptions, with their source, - type, zone and image, for a given query. - """ - serializer_class = ElementSearchResultSerializer - - def get_search(self, - corpora_ids=None, - query=None, - element_type=None, - date_lt=None, - date_gte=None, - min_confidence=0.0): - assert corpora_ids, 'Must filter by corpora' - - # Note that sorting by parents will not sort properly if there are multiple parents - # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#_sort_mode_option - search = ESElement.search() \ - .sort('parents', '_score') \ - .source(fields=['date_range']) \ - .filter('terms', corpus=corpora_ids) - - if date_lt or date_gte: - date_range = {'relation': 'intersects'} - if date_lt: - date_range['lt'] = date_lt - if date_gte: - date_range['gte'] = date_gte - - search = search.filter('range', date_range=date_range) - - if element_type: - search = search.filter('match', type=element_type) - - nested_query = Q('range', transcriptions__score={'gte': min_confidence}) - if query: - nested_query &= Q('simple_query_string', query=query, fields=['transcriptions.text']) - - elastic_query = Nested( - path='transcriptions', - inner_hits={'size': settings.ES_INNER_RESULTS_LIMIT}, - score_mode='sum', - query=FunctionScore( - query=nested_query, - functions=[FieldValueFactor(field='transcriptions.score')], - ), - ) - if query: - elastic_query |= Q('wildcard', references='*{}*'.format(query.lower())) - - search = search.query(elastic_query) - return search - - def post_process(self, *args, **kwargs): - return search_elements_post(*args) - - -@extend_schema_view( - get=extend_schema( - operation_id='SearchEntities', - tags=['search'], - parameters=[EntitySearchQuerySerializer], - ) -) -class EntitySearch(SearchAPIMixin, ListAPIView): - serializer_class = EntitySearchResultSerializer - query_serializer_class = EntitySearchQuerySerializer - - def get_search(self, query=None, type=None, 
corpora_ids=None): - assert corpora_ids, 'Must filter by corpora' - - search = ESEntity.search() \ - .filter('terms', corpus=corpora_ids) - - if query: - search = search.query('simple_query_string', query=query, fields=['name']) - if type: - search = search.query('match', type=type.value) - - return search - - def post_process(self, *args, **kwargs): - return search_entities_post(*args) - - @extend_schema_view( get=extend_schema( operation_id='SearchCorpus', @@ -222,7 +116,7 @@ class CorpusSearch(CorpusACLMixin, APIView): return previous_url, next_url def get(self, request, *args, **kwargs): - if not settings.ARKINDEX_FEATURES['search_v2']: + if not settings.ARKINDEX_FEATURES['search']: raise ValidationError(['Search features are not available on this instance.']) corpus = self.get_corpus(kwargs['pk']) diff --git a/arkindex/documents/dates.py b/arkindex/documents/dates.py index 18eb0244a96ed4c391a926f68eae222f14918d1c..f667fea427a921a370a747e208693f952cac4a9e 100644 --- a/arkindex/documents/dates.py +++ b/arkindex/documents/dates.py @@ -69,42 +69,13 @@ class InterpretedDate(object): continue return s > o - def es_round_up(self): - """ - Return a rounded up date using ElasticSearch date math syntax depending if month/day are present - (https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math) - Ex: '1990' date will be represented as '1999||+1y' which means '2000' for ElasticSearch - '2000-01' will return '2000-01||+1M' ('2000-02') - """ - return '{}||+1{}'.format(self.es_str(), self.precision.value) - - def to_es_range(self): - """ - Return a dict containing optional ElasticSearch indexing date-range fields - gte: greater than or equal - lt: lower than - Ex: Exact date 1920-01 will return {'gte': '1920-01', 'lt':'1920-01||+1M'} - Upper bound date 1860 will return {'lt': '1860||+1y'} only - """ - # Round date if needed - es_range = {} - if self.type in (DateType.Lower, DateType.Exact): - es_range['gte'] = self.es_str() - if self.type in (DateType.Upper, DateType.Exact): - es_range['lt'] = self.es_round_up() - return es_range - - def es_str(self): - return '-'.join('{:02d}'.format(e) for e in tuple(self) if e) - def __str__(self): - return self.es_str() + return '-'.join('{:02d}'.format(e) for e in tuple(self) if e) class InterpretedDateMixin(object): """ - Adds on-demand date parsing from a text field to InterpretedDates for - ElasticSearch indexation. + Adds on-demand date parsing from a text field to InterpretedDates. Requires a `raw_dates` property that returns the date string. 
""" diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py index e015bf1bc3684fbf0cedce5e2df7f550d2029608..df45b4d067bdcee821bc7a591a74bdb0e0f146c8 100644 --- a/arkindex/documents/indexer.py +++ b/arkindex/documents/indexer.py @@ -1,106 +1,343 @@ import datetime +import itertools +import json import logging -import time +from hashlib import md5 +from uuid import UUID from django.conf import settings -from django.db.models import QuerySet -from elasticsearch import Elasticsearch -from elasticsearch.exceptions import NotFoundError -from elasticsearch.helpers import bulk as es_bulk +from django.db.models import prefetch_related_objects +from SolrClient import SolrClient +from SolrClient.exceptions import SolrError +from teklia_toolbox.time import Timer -from arkindex.project.elastic import ESElement, ESEntity, ESTranscription +from arkindex.documents.models import Element +from arkindex.project.tools import CounterIterator logger = logging.getLogger(__name__) +solr = SolrClient(settings.SOLR_API_URL) + +# The SQL query that yields all indexable elements. +# Both indexable elements and all their children can be indexed, so child elements keep a link to their parent element +# using the parent_id, parent_name and parent_type columns. For parent elements, the columns are therefore duplicated. +PARENT_QUERY = """ +SELECT + element.id AS parent_id, + element.name AS parent_name, + elementtype.display_name AS parent_type, + element.id AS id, + element.name AS name, + elementtype.display_name AS type_name, + element.image_id AS image_id, + element.polygon::bytea AS polygon, + element.worker_version_id AS worker_version_id +FROM documents_element element +INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id) +WHERE element.corpus_id = %(corpus)s +AND elementtype.indexable +ORDER BY element.id +""" + +# The SQL query that yields all child elements of indexable elements. +ELEMENTS_QUERY = f""" +WITH parent AS ({PARENT_QUERY}) +SELECT + parent_id, + parent_name, + parent_type, + element.id as id, + element.name as name, + elementtype.display_name as type_name, + element.image_id AS image_id, + element.polygon::bytea AS polygon, + element.worker_version_id AS worker_version_id +FROM (SELECT * FROM parent LIMIT %(limit)s OFFSET %(offset)s) AS parent_chunk +INNER JOIN documents_elementpath as elementpath ON (elementpath.path @> ARRAY[parent_chunk.id]) +INNER JOIN documents_element as element ON (elementpath.element_id = element.id) +INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id) +""" + class Indexer(object): - documents = ( - ESTranscription, - ESElement, - ESEntity, - ) + # The query yielding all the elements to run on will look for all the child elements of all indexable elements + # The joins can take a very long time, so the query gets split into one to fetch all the indexable elements, + # then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET. 
+ sql_chunk_size = 10000 + + # Number of elements to load in Python from all of the SQL queries (can generate many more documents) + elements_chunk_size = 100 + + # Chunk of documents sent to Solr + solr_chunk_size = 200 + solr_num_shards = 1 + solr_options = {'commit': True} + solr_type_fields = [ + {'name': 'uuid', 'class': 'solr.UUIDField'}, + # Update string analyzer to support case-insensitive searching + { + 'name': 'string', + 'class': 'solr.TextField', + 'analyzer': { + 'tokenizer': {'class': 'solr.KeywordTokenizerFactory'}, + 'filters': [{'class': 'solr.LowerCaseFilterFactory'}] + } + } + ] + solr_fields = [ + # Parent fields + {'name': 'parent_id', 'indexed': False, 'required': True, 'type': 'uuid'}, + {'name': 'parent_name', 'indexed': False, 'required': True, 'type': 'string'}, + {'name': 'parent_type', 'indexed': False, 'required': True, 'type': 'string'}, + # Element fields + {'name': 'element_id', 'indexed': False, 'required': True, 'type': 'uuid'}, + {'name': 'element_text', 'indexed': True, 'required': True, 'type': 'string'}, + {'name': 'element_type', 'indexed': True, 'required': True, 'type': 'string'}, + {'name': 'element_worker', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'element_image', 'indexed': False, 'required': False, 'type': 'string'}, + # Transcription fields + {'name': 'transcription_id', 'indexed': False, 'required': False, 'type': 'uuid'}, + {'name': 'transcription_text', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'transcription_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'}, + {'name': 'transcription_worker', 'indexed': True, 'required': False, 'type': 'string'}, + # Classification fields + {'name': 'classification_id', 'indexed': False, 'required': False, 'type': 'uuid'}, + {'name': 'classification_name', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'classification_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'}, + {'name': 'classification_worker', 'indexed': True, 'required': False, 'type': 'string'}, + # Metadata fields + {'name': 'metadata_id', 'indexed': False, 'required': False, 'type': 'uuid'}, + {'name': 'metadata_name', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'metadata_text', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'metadata_type', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'metadata_worker', 'indexed': True, 'required': False, 'type': 'string'}, + # Entity fields + {'name': 'entity_id', 'indexed': False, 'required': False, 'type': 'uuid'}, + {'name': 'entity_text', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'entity_type', 'indexed': True, 'required': False, 'type': 'string'}, + {'name': 'entity_worker', 'indexed': True, 'required': False, 'type': 'string'} + ] - def __init__(self, hosts=settings.ELASTIC_SEARCH_HOSTS): - self.elastic = Elasticsearch(hosts) + def __init__(self, corpus_id): + self.corpus_id = corpus_id + self.collection_name = f'project-{self.corpus_id}' def setup(self): """ - Create indexes in ElasticSearch + Create collection in Solr """ - for document in self.documents: - logger.info('Creating or updating index for {}'.format(document.__name__)) - document.init() + if not solr.collections.exists(self.collection_name): + logger.info(f'Creating collection {self.collection_name}') + solr.collections.create(self.collection_name, self.solr_num_shards) + else: + logger.info(f'Collection {self.collection_name} already exists') - def 
drop_index(self, document): - """ - Try to drop an existing index - """ - try: - document._index.delete() - logger.info("Dropped index for {}".format(document.__name__)) - except NotFoundError: - logger.info("Could not drop index for {} (does not exist)".format(document.__name__)) + logger.info(f'Updating field types for {self.collection_name}') + # Disable SolrClient logging + logging.disable(logging.ERROR) + for type_field in self.solr_type_fields: + # SolrClient does not support field type management + params = { + 'method': 'POST', + 'endpoint': solr.schema.schema_endpoint, + 'collection': self.collection_name, + } + try: + solr.transport.send_request(**params, data=json.dumps({'add-field-type': type_field})) + except SolrError as e: + if f"Field type '{type_field['name']}' already exists." not in str(e): + raise e + solr.transport.send_request(**params, data=json.dumps({'replace-field-type': type_field})) + # Restore logging + logging.disable(logging.NOTSET) + + logger.info(f'Updating fields for {self.collection_name}') + # Disable SolrClient logging + logging.disable(logging.INFO) + for field in self.solr_fields: + if not solr.schema.does_field_exist(self.collection_name, field['name']): + solr.schema.create_field(self.collection_name, field) + else: + solr.schema.replace_field(self.collection_name, field) + # Restore logging + logging.disable(logging.NOTSET) - def run_index(self, items, bulk_size=400): + def drop_index(self): """ - Reindex elements from a QuerySet or list chunk by chunk and output status + Drop an existing collection """ - if isinstance(items, QuerySet): - count = items.count() - else: - count = len(items) - logger.info("{} items to reindex".format(count)) - - start_time, last, indexed = time.time(), 0, 0 - for i in range(0, count, bulk_size): - percent = int(100.0 * i / count) - if last != percent or bulk_size > 100: - secs = time.time() - start_time - per_second = int(i / secs) - logger.info( - '{0: >3}% => {1: >5} / {2} (indexed: {3}, execution time: {4}, {5} items/s)'.format( - percent, i, count, indexed, datetime.timedelta(seconds=secs), per_second)) - last = percent - - indexed += self.index(items[i:i + bulk_size]) - - def index(self, items, retries=3, chunk_size=100, timeout=15): + solr.delete_doc_by_query(self.collection_name, '*:*', **self.solr_options) + logger.info(f'Dropped index for {self.collection_name}') + + def get_elements(self): + # First make the query that returns all indexable elements. + parents = Element.objects.raw(PARENT_QUERY, params={'corpus': self.corpus_id}).iterator() + # Keep track of the parents count without using SELECT COUNT(*) or storing everything in RAM, + # as we will need it to split the next queries into chunks. + iterator = CounterIterator(parents) + yield from iterator + + # Execute one SQL query for each chunk of parent indexable elements. + # This query returns one row for each child element of each parent element, + # which can quickly makes joins very slow; chunking makes the issue manageable. + # We use LIMIT and OFFSET to perform the chunking as sending boatloads of element UUIDs + # back and forth between Python and the DB can be quite slow. 
+ for offset in range(0, len(iterator), self.sql_chunk_size): + yield from Element.objects.raw(ELEMENTS_QUERY, params={ + 'corpus': self.corpus_id, + 'limit': self.sql_chunk_size, + 'offset': offset, + }).iterator() + + def elements_chunk(self, elements): + while True: + chunk = tuple(itertools.islice(elements, self.elements_chunk_size)) + if not chunk: + return + yield chunk + + def hash_worker(self, worker_version): + if not worker_version: + return + + return worker_version.worker.name + + def build_solr_id(self, element, target): + hash = md5(element.id.bytes + target.id.bytes) + return UUID(hash.hexdigest()) + + def build_documents(self, elements): + BUILD_METHODS = [self.build_transcriptions, self.build_classifications, self.build_metadatas, self.build_entities] + documents = [] + for element in elements: + document = self.build_element(element) + documents.append(document) + for method in BUILD_METHODS: + documents += method(element, document) + return documents + + def build_element(self, element): + return { + 'id': str(element.id), + 'parent_id': str(element.parent_id), + 'parent_name': element.parent_name, + 'parent_type': element.parent_type, + 'element_id': str(element.id), + 'element_text': element.name, + 'element_type': element.type_name, + 'element_worker': self.hash_worker(element.worker_version), + 'element_image': element.iiif_thumbnail_url + } + + def build_transcriptions(self, element, document): + return [ + dict(document, **{ + 'id': str(self.build_solr_id(element, transcription)), + 'transcription_id': str(transcription.id), + 'transcription_text': transcription.text, + 'transcription_confidence': transcription.confidence, + 'transcription_worker': self.hash_worker(transcription.worker_version) + }) for transcription in element.transcriptions.all() + ] + + def build_classifications(self, element, document): + return [ + dict(document, **{ + 'id': str(self.build_solr_id(element, classification)), + 'classification_id': str(classification.id), + 'classification_name': classification.ml_class.name, + 'classification_confidence': classification.confidence, + 'classification_worker': self.hash_worker(classification.worker_version) + }) for classification in element.classifications.all() + ] + + def build_metadatas(self, element, document): + return [ + dict(document, **{ + 'id': str(self.build_solr_id(element, metadata)), + 'metadata_id': str(metadata.id), + 'metadata_name': metadata.name, + 'metadata_text': metadata.value, + 'metadata_type': metadata.type.value, + 'metadata_worker': self.hash_worker(metadata.worker_version) + }) for metadata in element.metadatas.all() + ] + + def build_entities(self, element, document): + entities = [entity for transcription in element.transcriptions.all() for entity in transcription.entities.all()] + entities += [metadata.entity for metadata in element.metadatas.all() if metadata.entity] + return [ + dict(document, **{ + 'id': str(self.build_solr_id(element, entity)), + 'entity_id': str(entity.id), + 'entity_text': entity.name, + 'entity_type': entity.type.value, + 'entity_worker': self.hash_worker(entity.worker_version) + }) for entity in entities + ] + + def index(self): """ - Insert or update items into ElasticSearch + Insert items into Solr + Process elements with an indexable type and their children by chunk + For each chunk: + - load their dependencies (transcriptions, classifications...) 
+ - serialize items in a Solr document + - send the documents to Solr """ - assert retries > 0 - if isinstance(items, QuerySet): - items = list(items) - - # Build raw ElasticSearch insert - actions = [ - item.es_document.from_model(item).to_dict(include_meta=True, skip_empty=False) - for item in items - ] + elements = self.get_elements() + + total_elements, total_documents = 0, 0 + retrieve_time, build_time, index_time = datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0) + + for elements_chunk in self.elements_chunk(elements): + nb_elements = len(elements_chunk) + total_elements += nb_elements + logger.debug(f'Processing {nb_elements} elements...') + # Retrieve elements from db + with Timer() as t: + # Element + prefetch_related_objects(elements_chunk, 'image__server') + prefetch_related_objects(elements_chunk, 'worker_version__worker') + # Transcriptions + prefetch_related_objects(elements_chunk, 'transcriptions') + prefetch_related_objects(elements_chunk, 'transcriptions__worker_version__worker') + # Classifications + prefetch_related_objects(elements_chunk, 'classifications') + prefetch_related_objects(elements_chunk, 'classifications__worker_version__worker') + # Metadatas + prefetch_related_objects(elements_chunk, 'metadatas') + prefetch_related_objects(elements_chunk, 'metadatas__worker_version__worker') + # Entities + prefetch_related_objects(elements_chunk, 'transcriptions__entities') + prefetch_related_objects(elements_chunk, 'transcriptions__entities__worker_version__worker') + prefetch_related_objects(elements_chunk, 'metadatas__entity') + prefetch_related_objects(elements_chunk, 'metadatas__entity__worker_version__worker') + retrieve_time += t.delta + logger.debug(f'Retrieved {nb_elements} elements') + + # Build Solr documents + with Timer() as t: + documents = self.build_documents(elements_chunk) + nb_documents = len(documents) + total_documents += nb_documents + build_time += t.delta + logger.debug(f'Built {nb_documents} Solr documents') + + # Index documents into Solr + with Timer() as t: + for i in range(0, nb_documents, self.solr_chunk_size): + solr.index(self.collection_name, documents[i:i + self.solr_chunk_size], **self.solr_options) + index_time += t.delta + logger.debug(f'Indexed {nb_documents} documents into Solr') + + result = solr.query(self.collection_name, {'q': '*:*'}) + logger.info(f'Currently {result.get_num_found()} documents in Solr') + + logger.info(f'Retrieved {total_elements} elements in {retrieve_time}') + logger.info(f'Built {total_documents} Solr documents in {build_time}') + logger.info(f'Indexed {total_documents} documents into Solr in {index_time}') - # Run actions in bulk - try: - nb_insert, _ = es_bulk( - self.elastic, - actions, - chunk_size=chunk_size, - initial_backoff=2, - max_retries=2, - request_timeout=timeout, - stats_only=True, - _source=False, - ) - except Exception as e: - if retries <= 1: - logger.warning('Failed to bulk insert into ES : {}'.format(e)) - return 0 - logger.warning('Failed to bulk insert into ES - retrying : {}'.format(e)) - return self.index( - items, - retries=retries - 1, - chunk_size=max(chunk_size // 2, 1), - timeout=timeout * 2, - ) - - return nb_insert + result = solr.query(self.collection_name, {'q': '*:*'}) + logger.info(f'Currently {result.get_num_found()} documents in Solr') diff --git a/arkindex/documents/indexer_v2.py b/arkindex/documents/indexer_v2.py deleted file mode 100644 index df45b4d067bdcee821bc7a591a74bdb0e0f146c8..0000000000000000000000000000000000000000 --- 
a/arkindex/documents/indexer_v2.py +++ /dev/null @@ -1,343 +0,0 @@ -import datetime -import itertools -import json -import logging -from hashlib import md5 -from uuid import UUID - -from django.conf import settings -from django.db.models import prefetch_related_objects -from SolrClient import SolrClient -from SolrClient.exceptions import SolrError -from teklia_toolbox.time import Timer - -from arkindex.documents.models import Element -from arkindex.project.tools import CounterIterator - -logger = logging.getLogger(__name__) - -solr = SolrClient(settings.SOLR_API_URL) - -# The SQL query that yields all indexable elements. -# Both indexable elements and all their children can be indexed, so child elements keep a link to their parent element -# using the parent_id, parent_name and parent_type columns. For parent elements, the columns are therefore duplicated. -PARENT_QUERY = """ -SELECT - element.id AS parent_id, - element.name AS parent_name, - elementtype.display_name AS parent_type, - element.id AS id, - element.name AS name, - elementtype.display_name AS type_name, - element.image_id AS image_id, - element.polygon::bytea AS polygon, - element.worker_version_id AS worker_version_id -FROM documents_element element -INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id) -WHERE element.corpus_id = %(corpus)s -AND elementtype.indexable -ORDER BY element.id -""" - -# The SQL query that yields all child elements of indexable elements. -ELEMENTS_QUERY = f""" -WITH parent AS ({PARENT_QUERY}) -SELECT - parent_id, - parent_name, - parent_type, - element.id as id, - element.name as name, - elementtype.display_name as type_name, - element.image_id AS image_id, - element.polygon::bytea AS polygon, - element.worker_version_id AS worker_version_id -FROM (SELECT * FROM parent LIMIT %(limit)s OFFSET %(offset)s) AS parent_chunk -INNER JOIN documents_elementpath as elementpath ON (elementpath.path @> ARRAY[parent_chunk.id]) -INNER JOIN documents_element as element ON (elementpath.element_id = element.id) -INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id) -""" - - -class Indexer(object): - - # The query yielding all the elements to run on will look for all the child elements of all indexable elements - # The joins can take a very long time, so the query gets split into one to fetch all the indexable elements, - # then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET. 
- sql_chunk_size = 10000 - - # Number of elements to load in Python from all of the SQL queries (can generate many more documents) - elements_chunk_size = 100 - - # Chunk of documents sent to Solr - solr_chunk_size = 200 - solr_num_shards = 1 - solr_options = {'commit': True} - solr_type_fields = [ - {'name': 'uuid', 'class': 'solr.UUIDField'}, - # Update string analyzer to support case-insensitive searching - { - 'name': 'string', - 'class': 'solr.TextField', - 'analyzer': { - 'tokenizer': {'class': 'solr.KeywordTokenizerFactory'}, - 'filters': [{'class': 'solr.LowerCaseFilterFactory'}] - } - } - ] - solr_fields = [ - # Parent fields - {'name': 'parent_id', 'indexed': False, 'required': True, 'type': 'uuid'}, - {'name': 'parent_name', 'indexed': False, 'required': True, 'type': 'string'}, - {'name': 'parent_type', 'indexed': False, 'required': True, 'type': 'string'}, - # Element fields - {'name': 'element_id', 'indexed': False, 'required': True, 'type': 'uuid'}, - {'name': 'element_text', 'indexed': True, 'required': True, 'type': 'string'}, - {'name': 'element_type', 'indexed': True, 'required': True, 'type': 'string'}, - {'name': 'element_worker', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'element_image', 'indexed': False, 'required': False, 'type': 'string'}, - # Transcription fields - {'name': 'transcription_id', 'indexed': False, 'required': False, 'type': 'uuid'}, - {'name': 'transcription_text', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'transcription_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'}, - {'name': 'transcription_worker', 'indexed': True, 'required': False, 'type': 'string'}, - # Classification fields - {'name': 'classification_id', 'indexed': False, 'required': False, 'type': 'uuid'}, - {'name': 'classification_name', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'classification_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'}, - {'name': 'classification_worker', 'indexed': True, 'required': False, 'type': 'string'}, - # Metadata fields - {'name': 'metadata_id', 'indexed': False, 'required': False, 'type': 'uuid'}, - {'name': 'metadata_name', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'metadata_text', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'metadata_type', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'metadata_worker', 'indexed': True, 'required': False, 'type': 'string'}, - # Entity fields - {'name': 'entity_id', 'indexed': False, 'required': False, 'type': 'uuid'}, - {'name': 'entity_text', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'entity_type', 'indexed': True, 'required': False, 'type': 'string'}, - {'name': 'entity_worker', 'indexed': True, 'required': False, 'type': 'string'} - ] - - def __init__(self, corpus_id): - self.corpus_id = corpus_id - self.collection_name = f'project-{self.corpus_id}' - - def setup(self): - """ - Create collection in Solr - """ - if not solr.collections.exists(self.collection_name): - logger.info(f'Creating collection {self.collection_name}') - solr.collections.create(self.collection_name, self.solr_num_shards) - else: - logger.info(f'Collection {self.collection_name} already exists') - - logger.info(f'Updating field types for {self.collection_name}') - # Disable SolrClient logging - logging.disable(logging.ERROR) - for type_field in self.solr_type_fields: - # SolrClient does not support field type management - params = { - 'method': 
'POST', - 'endpoint': solr.schema.schema_endpoint, - 'collection': self.collection_name, - } - try: - solr.transport.send_request(**params, data=json.dumps({'add-field-type': type_field})) - except SolrError as e: - if f"Field type '{type_field['name']}' already exists." not in str(e): - raise e - solr.transport.send_request(**params, data=json.dumps({'replace-field-type': type_field})) - # Restore logging - logging.disable(logging.NOTSET) - - logger.info(f'Updating fields for {self.collection_name}') - # Disable SolrClient logging - logging.disable(logging.INFO) - for field in self.solr_fields: - if not solr.schema.does_field_exist(self.collection_name, field['name']): - solr.schema.create_field(self.collection_name, field) - else: - solr.schema.replace_field(self.collection_name, field) - # Restore logging - logging.disable(logging.NOTSET) - - def drop_index(self): - """ - Drop an existing collection - """ - solr.delete_doc_by_query(self.collection_name, '*:*', **self.solr_options) - logger.info(f'Dropped index for {self.collection_name}') - - def get_elements(self): - # First make the query that returns all indexable elements. - parents = Element.objects.raw(PARENT_QUERY, params={'corpus': self.corpus_id}).iterator() - # Keep track of the parents count without using SELECT COUNT(*) or storing everything in RAM, - # as we will need it to split the next queries into chunks. - iterator = CounterIterator(parents) - yield from iterator - - # Execute one SQL query for each chunk of parent indexable elements. - # This query returns one row for each child element of each parent element, - # which can quickly makes joins very slow; chunking makes the issue manageable. - # We use LIMIT and OFFSET to perform the chunking as sending boatloads of element UUIDs - # back and forth between Python and the DB can be quite slow. 
- for offset in range(0, len(iterator), self.sql_chunk_size): - yield from Element.objects.raw(ELEMENTS_QUERY, params={ - 'corpus': self.corpus_id, - 'limit': self.sql_chunk_size, - 'offset': offset, - }).iterator() - - def elements_chunk(self, elements): - while True: - chunk = tuple(itertools.islice(elements, self.elements_chunk_size)) - if not chunk: - return - yield chunk - - def hash_worker(self, worker_version): - if not worker_version: - return - - return worker_version.worker.name - - def build_solr_id(self, element, target): - hash = md5(element.id.bytes + target.id.bytes) - return UUID(hash.hexdigest()) - - def build_documents(self, elements): - BUILD_METHODS = [self.build_transcriptions, self.build_classifications, self.build_metadatas, self.build_entities] - documents = [] - for element in elements: - document = self.build_element(element) - documents.append(document) - for method in BUILD_METHODS: - documents += method(element, document) - return documents - - def build_element(self, element): - return { - 'id': str(element.id), - 'parent_id': str(element.parent_id), - 'parent_name': element.parent_name, - 'parent_type': element.parent_type, - 'element_id': str(element.id), - 'element_text': element.name, - 'element_type': element.type_name, - 'element_worker': self.hash_worker(element.worker_version), - 'element_image': element.iiif_thumbnail_url - } - - def build_transcriptions(self, element, document): - return [ - dict(document, **{ - 'id': str(self.build_solr_id(element, transcription)), - 'transcription_id': str(transcription.id), - 'transcription_text': transcription.text, - 'transcription_confidence': transcription.confidence, - 'transcription_worker': self.hash_worker(transcription.worker_version) - }) for transcription in element.transcriptions.all() - ] - - def build_classifications(self, element, document): - return [ - dict(document, **{ - 'id': str(self.build_solr_id(element, classification)), - 'classification_id': str(classification.id), - 'classification_name': classification.ml_class.name, - 'classification_confidence': classification.confidence, - 'classification_worker': self.hash_worker(classification.worker_version) - }) for classification in element.classifications.all() - ] - - def build_metadatas(self, element, document): - return [ - dict(document, **{ - 'id': str(self.build_solr_id(element, metadata)), - 'metadata_id': str(metadata.id), - 'metadata_name': metadata.name, - 'metadata_text': metadata.value, - 'metadata_type': metadata.type.value, - 'metadata_worker': self.hash_worker(metadata.worker_version) - }) for metadata in element.metadatas.all() - ] - - def build_entities(self, element, document): - entities = [entity for transcription in element.transcriptions.all() for entity in transcription.entities.all()] - entities += [metadata.entity for metadata in element.metadatas.all() if metadata.entity] - return [ - dict(document, **{ - 'id': str(self.build_solr_id(element, entity)), - 'entity_id': str(entity.id), - 'entity_text': entity.name, - 'entity_type': entity.type.value, - 'entity_worker': self.hash_worker(entity.worker_version) - }) for entity in entities - ] - - def index(self): - """ - Insert items into Solr - Process elements with an indexable type and their children by chunk - For each chunk: - - load their dependencies (transcriptions, classifications...) 
- - serialize items in a Solr document - - send the documents to Solr - """ - elements = self.get_elements() - - total_elements, total_documents = 0, 0 - retrieve_time, build_time, index_time = datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0) - - for elements_chunk in self.elements_chunk(elements): - nb_elements = len(elements_chunk) - total_elements += nb_elements - logger.debug(f'Processing {nb_elements} elements...') - # Retrieve elements from db - with Timer() as t: - # Element - prefetch_related_objects(elements_chunk, 'image__server') - prefetch_related_objects(elements_chunk, 'worker_version__worker') - # Transcriptions - prefetch_related_objects(elements_chunk, 'transcriptions') - prefetch_related_objects(elements_chunk, 'transcriptions__worker_version__worker') - # Classifications - prefetch_related_objects(elements_chunk, 'classifications') - prefetch_related_objects(elements_chunk, 'classifications__worker_version__worker') - # Metadatas - prefetch_related_objects(elements_chunk, 'metadatas') - prefetch_related_objects(elements_chunk, 'metadatas__worker_version__worker') - # Entities - prefetch_related_objects(elements_chunk, 'transcriptions__entities') - prefetch_related_objects(elements_chunk, 'transcriptions__entities__worker_version__worker') - prefetch_related_objects(elements_chunk, 'metadatas__entity') - prefetch_related_objects(elements_chunk, 'metadatas__entity__worker_version__worker') - retrieve_time += t.delta - logger.debug(f'Retrieved {nb_elements} elements') - - # Build Solr documents - with Timer() as t: - documents = self.build_documents(elements_chunk) - nb_documents = len(documents) - total_documents += nb_documents - build_time += t.delta - logger.debug(f'Built {nb_documents} Solr documents') - - # Index documents into Solr - with Timer() as t: - for i in range(0, nb_documents, self.solr_chunk_size): - solr.index(self.collection_name, documents[i:i + self.solr_chunk_size], **self.solr_options) - index_time += t.delta - logger.debug(f'Indexed {nb_documents} documents into Solr') - - result = solr.query(self.collection_name, {'q': '*:*'}) - logger.info(f'Currently {result.get_num_found()} documents in Solr') - - logger.info(f'Retrieved {total_elements} elements in {retrieve_time}') - logger.info(f'Built {total_documents} Solr documents in {build_time}') - logger.info(f'Indexed {total_documents} documents into Solr in {index_time}') - - result = solr.query(self.collection_name, {'q': '*:*'}) - logger.info(f'Currently {result.get_num_found()} documents in Solr') diff --git a/arkindex/documents/management/commands/reindex.py b/arkindex/documents/management/commands/reindex.py index 1b515b2f74b3e4ce88d9127e13a29c1988c1fc64..e795be6ed5682ea871764185b18dfb9dcb44908d 100644 --- a/arkindex/documents/management/commands/reindex.py +++ b/arkindex/documents/management/commands/reindex.py @@ -1,140 +1,50 @@ #!/usr/bin/env python3 -import logging +import uuid from django.conf import settings from django.core.management.base import BaseCommand, CommandError from arkindex.documents.indexer import Indexer -from arkindex.documents.models import Element, Entity, Transcription -from arkindex.project.argparse import CorpusArgument, ElementArgument - -logging.basicConfig( - level=logging.INFO, - format='[%(levelname)s] %(message)s', -) -logger = logging.getLogger(__name__) - - -def get_transcriptions(corpus=None, folder=None): - if folder: - # Lookup all the transcriptions linked to a folder - queryset = Transcription.objects.filter( - 
element__in=Element.objects.get_descending(folder.id) - ).distinct() - elif corpus: - queryset = Transcription.objects.filter(element__corpus=corpus) - else: - queryset = Transcription.objects.all() - - return queryset.select_related('element') - - -def get_elements(corpus=None, folder=None): - if folder: - queryset = Element.objects.get_descending(folder.id) - elif corpus: - queryset = Element.objects.filter(corpus=corpus) - else: - queryset = Element.objects.all() - - return queryset.select_related('type').prefetch_related('metadatas', 'transcriptions') - - -def get_entities(corpus=None, folder=None): - if folder: - return Entity.objects.filter( - metadatas__element__in=Element.objects.get_descending(folder.id) - ).distinct() - elif corpus: - return Entity.objects.filter(corpus=corpus) - - return Entity.objects.all() +from arkindex.documents.models import Corpus class Command(BaseCommand): - help = 'Selectively reindex transcriptions, elements and entities into ElasticSearch' - - # Setup for reindexation of different Elements - index_methods = { - 'elements': { - 'bulk_size': 100, - 'model': Element, - 'items': get_elements, - }, - 'transcriptions': { - 'bulk_size': 400, - 'model': Transcription, - 'items': get_transcriptions, - }, - 'entities': { - 'bulk_size': 400, - 'model': Entity, - 'items': get_entities, - } - } + help = 'Reindex elements, transcriptions and classifications into Solr' def add_arguments(self, parser): super().add_arguments(parser) parser.add_argument( - '-ts', '--transcriptions', - help='Reindex transcriptions', - action='store_true', - ) - parser.add_argument( - '--elements', - help='Reindex elements', - action='store_true', + "corpus_id", help="UUID of an existing corpus to reindex", type=uuid.UUID ) parser.add_argument( - '--entities', - help='Reindex entities', + '--drop', + help="Drop the existing indexes before reindexing", action='store_true', ) parser.add_argument( - '--corpus', - help='Restrict reindexing to a specific corpus by ID or part of the name', - type=CorpusArgument(), - ) - parser.add_argument( - '--folder', - help='Restrict reindexing to a specific folder by ID or part of the name', - type=ElementArgument(type__folder=True), - ) - parser.add_argument( - '--drop', - help="Drop the existing indexes before reindexing", + '--setup', + help="Only setup a collection. Create a collection and fields if they do not exist or update the fields", action='store_true', ) - def handle(self, folder=None, corpus=None, **options): + def handle(self, corpus_id, **options): if not settings.ARKINDEX_FEATURES['search']: raise CommandError('Reindexation is not possible if the search feature flag is disabled. 
' 'Consider setting `features.search` to `on` or `true` or `yes` in the YAML ' - 'configuration file, and configuring ElasticSearch properly.') - - if corpus and folder and folder.corpus != corpus: - raise CommandError('Folder {} is not in corpus {}'.format(folder, corpus)) + 'configuration file, and configuring Solr properly.') - # If no specific index was set, reindex everything - if not any(options.get(k) for k in self.index_methods.keys()): - methods = self.index_methods.keys() - else: - methods = list(filter(options.get, self.index_methods.keys())) + try: + corpus = Corpus.objects.get(id=corpus_id) + except Corpus.DoesNotExist: + raise CommandError(f'Corpus {corpus_id} does not exist') - indexer = Indexer() - if options.get('drop'): - for method in methods: - config = self.index_methods[method] - indexer.drop_index(config['model'].es_document) + if not corpus.indexable: + raise CommandError(f'Corpus {corpus.name} is not indexable') + indexer = Indexer(corpus.id) indexer.setup() - - for method in methods: - config = self.index_methods[method] - - items = config['items'](corpus=corpus, folder=folder) - - indexer.run_index( - items, - bulk_size=config['bulk_size'], - ) + if options.get('setup'): + return + if options.get('drop'): + indexer.drop_index() + indexer.index() diff --git a/arkindex/documents/management/commands/reindex_v2.py b/arkindex/documents/management/commands/reindex_v2.py deleted file mode 100644 index fad4a37f00333138133915f5ec2bf21d0e98c35e..0000000000000000000000000000000000000000 --- a/arkindex/documents/management/commands/reindex_v2.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -import uuid - -from django.conf import settings -from django.core.management.base import BaseCommand, CommandError - -from arkindex.documents.indexer_v2 import Indexer -from arkindex.documents.models import Corpus - - -class Command(BaseCommand): - help = 'Reindex elements, transcriptions and classifications into Solr' - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument( - "corpus_id", help="UUID of an existing corpus to reindex", type=uuid.UUID - ) - parser.add_argument( - '--drop', - help="Drop the existing indexes before reindexing", - action='store_true', - ) - parser.add_argument( - '--setup', - help="Only setup a collection. Create a collection and fields if they do not exist or update the fields", - action='store_true', - ) - - def handle(self, corpus_id, **options): - if not settings.ARKINDEX_FEATURES['search_v2']: - raise CommandError('Reindexation is not possible if the search feature flag is disabled. 
' - 'Consider setting `features.search_v2` to `on` or `true` or `yes` in the YAML ' - 'configuration file, and configuring Solr properly.') - - try: - corpus = Corpus.objects.get(id=corpus_id) - except Corpus.DoesNotExist: - raise CommandError(f'Corpus {corpus_id} does not exist') - - if not corpus.indexable: - raise CommandError(f'Corpus {corpus.name} is not indexable') - - indexer = Indexer(corpus.id) - indexer.setup() - if options.get('setup'): - return - if options.get('drop'): - indexer.drop_index() - indexer.index() diff --git a/arkindex/documents/managers.py b/arkindex/documents/managers.py index 9d577d78c5f8ad31b3c86f07e2f8f49733291241..de754299ac8b47ec1de2592d92ea1e92dbfc5891 100644 --- a/arkindex/documents/managers.py +++ b/arkindex/documents/managers.py @@ -1,5 +1,5 @@ import uuid -from itertools import chain, groupby +from itertools import chain import django from django.db import DJANGO_VERSION_PICKLE_KEY, connections, models @@ -120,43 +120,6 @@ class ElementManager(models.Manager): return self.filter(id__in=parent_ids) - def get_ascendings_paths(self, *children_ids, **filters): - """ - Get all ascending paths for some elements IDs. - """ - from arkindex.documents.models import ElementPath - # Load all parents - parents = { - parent.id: parent - for parent in self.filter( - **filters, - id__in=ElementPath - .objects - .filter(element_id__in=children_ids) - .annotate(parent_id=Unnest('path')) - .values('parent_id') - ) - } - - # Loads paths and group them by element ids - paths = ElementPath.objects.filter(element_id__in=children_ids).order_by('element_id') - tree = { - elt_id: [p.path for p in elt_paths] - for elt_id, elt_paths in groupby(paths, lambda e: e.element_id) - } - - # Put Element instances in paths - return { - elt_id: [ - [ - parents[parent_id] - for parent_id in path - ] - for path in paths - ] - for elt_id, paths in tree.items() - } - def get_descending(self, parent_id, **filters): """ Get all child elements for a specific element ID. 
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 25994e38ecfacef45e50013c19e772075f26bfe3..e9d13d9e2d9294f8a3a8d961b709320c94f282f3 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -21,7 +21,6 @@ from arkindex.documents.deletion import delete_element from arkindex.documents.managers import CorpusManager, ElementManager from arkindex.project.aws import S3FileMixin from arkindex.project.default_corpus import DEFAULT_CORPUS_TYPES, DEFAULT_TRANSKRIBUS_TYPES -from arkindex.project.elastic import ESElement, ESEntity, ESTranscription from arkindex.project.fields import ArrayField, LinearRingField from arkindex.project.models import IndexableModel @@ -166,7 +165,6 @@ class Element(IndexableModel): help_text='Mirror the image along the vertical axis before rotating.', ) - es_document = ESElement objects = ElementManager() class Meta: @@ -381,7 +379,6 @@ class Entity(InterpretedDateMixin, models.Model): """ Semantic object in arkindex """ - es_document = ESEntity id = models.UUIDField(default=uuid.uuid4, primary_key=True, editable=False) name = models.TextField() @@ -479,8 +476,6 @@ class Transcription(models.Model): A transcription on: * a zone on an image """ - es_document = ESTranscription - id = models.UUIDField(default=uuid.uuid4, primary_key=True, editable=False) element = models.ForeignKey( Element, diff --git a/arkindex/documents/search.py b/arkindex/documents/search.py deleted file mode 100644 index 36e1c6b9424e1999448e8f0a54a43838131de907..0000000000000000000000000000000000000000 --- a/arkindex/documents/search.py +++ /dev/null @@ -1,102 +0,0 @@ -import uuid -from itertools import chain - -from django.db.models import prefetch_related_objects - -from arkindex.documents.models import Element, Entity, Transcription - - -def search_transcriptions_post(data): - ''' - Search pages with a transcription - List pages ordered by document - ''' - transcription_ids = [r.meta.id for r in data] - ts = Transcription.objects \ - .filter(id__in=transcription_ids) \ - .order_by('-confidence') \ - .select_related('element__image__server', 'worker_version') - element_ids = list(ts.values_list('element_id', flat=True)) - all_parent_paths = Element.objects.get_ascendings_paths(*element_ids) - for trans in ts: - trans.parent_paths = all_parent_paths.get(trans.element_id, []) - return ts - - -def search_transcriptions_filter_post(data, element_id): - if not isinstance(element_id, uuid.UUID): - element_id = uuid.UUID(element_id) - return filter( - lambda t: element_id in [p.id for p in chain(*t.parent_paths)], - search_transcriptions_post(data) - ) - - -def search_entities_post(data): - """ - Search entities containing query - """ - entity_ids = [uuid.UUID(r.meta.id) for r in data] - # Preserve ordering from ElasticSearch - entities = { - entity.id: entity - for entity in Entity.objects.filter(id__in=entity_ids) - } - # Ignore entities from ES that do not exist in DB - return list(filter(None, map(entities.get, entity_ids))) - - -def search_elements_post(data): - """ - Search elements for a query - """ - elt_ids = list(map(uuid.UUID, (result.meta.id for result in data))) - if not elt_ids: - return [] - tr_ids = [ - uuid.UUID(transcription.id) - for result in data - for transcription in result.meta.inner_hits.transcriptions - ] - tr_totals = { - uuid.UUID(result.meta.id): result.meta.inner_hits.transcriptions.hits.total - for result in data - } - - transcriptions = { - t.id: t - for t in 
Transcription.objects.filter(id__in=tr_ids).select_related('worker_version') - } - - elts_tr_ids = { - uuid.UUID(result.meta.id): [ - uuid.UUID(transcription.id) - for transcription in result.meta.inner_hits.transcriptions - ] for result in data - } - - elts_date_range = { - uuid.UUID(result.meta.id): result.date_range - if getattr(result, 'date_range', None) else {} - for result in data - } - - elts = list(Element.objects.filter(id__in=elt_ids).prefetch_related('corpus', 'type', 'image__server')) - # Preserve the ordering given by ElasticSearch - ordered_elts = list(filter(None, map(lambda eid: next((e for e in elts if e.id == eid), None), elt_ids))) - - all_paths = Element.objects.get_ascendings_paths(*(e.id for e in ordered_elts)) - prefetch_related_objects( - [element for paths in all_paths.values() for path in paths for element in path], - 'type', - ) - - for elt in ordered_elts: - elt.transcriptions_results = list(filter(None, [transcriptions.get(tid) for tid in elts_tr_ids[elt.id]])) - elt.total_transcriptions = tr_totals[elt.id] - elt.parent_paths = all_paths.get(elt.id, []) - date_range = elts_date_range.get(elt.id, {}) - elt.date_from = date_range['gte'] if 'gte' in date_range else None - elt.date_to = date_range['lt'] if 'lt' in date_range else None - - return ordered_elts diff --git a/arkindex/documents/serializers/iiif/__init__.py b/arkindex/documents/serializers/iiif/__init__.py index 0e9a7c7c35775493724ee12ea9e96c3ebf40e9eb..ebc280205a8f49ecfeda62ba5794606f1131ff3b 100644 --- a/arkindex/documents/serializers/iiif/__init__.py +++ b/arkindex/documents/serializers/iiif/__init__.py @@ -1,5 +1,2 @@ -from arkindex.documents.serializers.iiif.annotations import ( # noqa: F401 - ElementAnnotationListSerializer, - TranscriptionSearchAnnotationListSerializer, -) +from arkindex.documents.serializers.iiif.annotations import ElementAnnotationListSerializer # noqa: F401 from arkindex.documents.serializers.iiif.manifests import FolderManifestSerializer # noqa: F401 diff --git a/arkindex/documents/serializers/iiif/annotations.py b/arkindex/documents/serializers/iiif/annotations.py index 781996ed5208ae4f159c0b66b7f9707b13ab1e6d..984aea35ffb97f72d1a68ca86b395a3d382fcae9 100644 --- a/arkindex/documents/serializers/iiif/annotations.py +++ b/arkindex/documents/serializers/iiif/annotations.py @@ -73,15 +73,6 @@ class TranscriptionAnnotationSerializer(AnnotationSerializer): } -class TranscriptionSearchAnnotationSerializer(TranscriptionAnnotationSerializer): - - def get_target(self, ts): - assert isinstance(ts, Transcription) - url = build_absolute_url(ts.element, self.context['request'], 'api:iiif-canvas') - x, y, w, h = bounding_box(ts.element.polygon) - return f'{url}#xywh={x},{y},{w},{h}' - - class AnnotationListSerializer(serializers.Serializer): """ Serialize a list of serialized annotations into a IIIF annotation list @@ -117,43 +108,3 @@ class ElementAnnotationListSerializer(AnnotationListSerializer): """ annotation_serializer = TranscriptionAnnotationSerializer - - -class SearchAnnotationListSerializer(AnnotationListSerializer): - """ - Serialize a list of serialized annotations into a search result annotation list - """ - - def to_representation(self, obj): - serialized = super().to_representation(obj) - serialized['@context'] = settings.IIIF_SEARCH_CONTEXT - serialized['within'] = { - "@type": "sc:Layer", - "total": len(serialized['resources']), - } - serialized['startIndex'] = 0 - serialized['hits'] = [ - { - "@type": "search:Hit", - "annotations": [anno['@id']], - "match": 
self.get_match(anno), - } - for anno in serialized['resources'] - ] - return serialized - - def get_match(self, anno): - """Get a match text for an annotation. - This is optional in the Search API but mandatory with Mirador""" - return anno['resource']['chars'] - - -class TranscriptionSearchAnnotationListSerializer(SearchAnnotationListSerializer): - """ - Serialize a transcription search result into an annotation list - """ - - annotation_serializer = TranscriptionSearchAnnotationSerializer - - def get_elements(self, obj): - return obj diff --git a/arkindex/documents/serializers/iiif/manifests.py b/arkindex/documents/serializers/iiif/manifests.py index 63a655fcb7200e09078927689d3a5a10a5e03130..778ead3849a652c345548d0d285ca129680cf466 100644 --- a/arkindex/documents/serializers/iiif/manifests.py +++ b/arkindex/documents/serializers/iiif/manifests.py @@ -134,15 +134,6 @@ class FolderManifestSerializer(serializers.Serializer): else: thumbnail = None - services = [] - if settings.ARKINDEX_FEATURES['search']: - services.append({ - "@context": settings.IIIF_SEARCH_CONTEXT, - "@id": build_absolute_url(element, self.context['request'], 'api:iiif-search'), - "profile": settings.IIIF_SEARCH_SERVICE_PROFILE, - "label": "Search transcriptions", - }) - return { "@context": settings.IIIF_PRESENTATION_CONTEXT, "@id": build_absolute_url(element, self.context['request'], self.id_url_name), @@ -162,7 +153,7 @@ class FolderManifestSerializer(serializers.Serializer): "viewingHint": "individuals", "label": element.name, "viewingDirection": "left-to-right", - "service": services, + "service": [], "metadata": ManifestMetadataSerializer( element.metadatas.exclude(type=MetaType.Markdown), context=self.context, diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py index a92305ad98e7e7a58df4a4fc26d2e8d32ad5a434..67995b08b18e4b3161bcf64dd65add9ea117f207 100644 --- a/arkindex/documents/serializers/search.py +++ b/arkindex/documents/serializers/search.py @@ -1,152 +1,9 @@ -import math - -from django.conf import settings -from drf_spectacular.utils import extend_schema_field from rest_framework import serializers -from arkindex.documents.date_parser import parse_date -from arkindex.documents.models import Element, ElementType, Entity, EntityType, MetaType -from arkindex.documents.serializers.light import CorpusLightSerializer, ElementLightSerializer -from arkindex.documents.serializers.ml import TranscriptionSerializer -from arkindex.images.serializers import ZoneSerializer +from arkindex.documents.models import EntityType, MetaType from arkindex.project.serializer_fields import EnumField -class SearchQuerySerializer(serializers.Serializer): - """ - Search parameters validation serializer in order to build a ES query - - date_lte is rounded to superior value using ES syntax (see round_date()) and served as date_lt (exactly lower than) - Rounded value depend if month/day are provided (Ex: '<= march 1999' becomes '< april 1999') - """ - q = serializers.CharField(source='query', default=None, max_length=settings.ES_QUERY_STRING_MAX_LENGTH) - confidence = serializers.FloatField(source='min_confidence', min_value=0.0, max_value=1.0, default=0.0) - date_gte = serializers.CharField(default=None) - date_lte = serializers.CharField(source='date_lt', default=None) - element_type = serializers.SlugField(default=None) - corpus = serializers.UUIDField(source='corpus_id', default=None) - - def parse_date(self, raw_date): - date = parse_date(raw_date) - if len(date) != 1: - raise 
serializers.ValidationError('Could not parse Date') - return date[0] - - def validate_confidence(self, value): - if not isinstance(value, float) or math.isinf(value) or math.isnan(value): - raise serializers.ValidationError('Confidence should be a valid decimal number between 0 and 1') - return value - - def validate_date_gte(self, value): - if not value: - return - return self.parse_date(value).es_str() - - def validate_date_lte(self, value): - if not value: - return - return self.parse_date(value).es_round_up() - - def validate(self, data): - data = super().validate(data) - - gte, lt = data['date_gte'], data['date_lt'] - if gte and lt and gte > lt: - raise serializers.ValidationError("Upper date must be greater than lower date") - if not (data['query'] or data['date_lt'] or data['date_gte']): - raise serializers.ValidationError("At least query terms or dates must be provided") - - corpus_id, element_type = data['corpus_id'], data['element_type'] - if element_type: - if not corpus_id: - raise serializers.ValidationError('A corpus is required to filter by element types') - if not ElementType.objects.filter(corpus_id=corpus_id, slug=element_type).exists(): - raise serializers.ValidationError( - 'Type with slug {!r} does not exist in corpus {}'.format(element_type, corpus_id), - ) - - return data - - -class EntitySearchQuerySerializer(serializers.Serializer): - """ - Search parameters validation serializer in order to build a ES query for entity - """ - q = serializers.CharField(source='query', max_length=settings.ES_QUERY_STRING_MAX_LENGTH) - type = EnumField(enum=EntityType, default=None) - corpus = serializers.UUIDField(source='corpus_id', default=None) - - -class IIIFSearchQuerySerializer(serializers.Serializer): - """ - Search parameters for IIIF transcription search - See https://iiif.io/api/search/1.0/#request-1 - """ - q = serializers.CharField(source='query', max_length=settings.ES_QUERY_STRING_MAX_LENGTH) - - -class ElementSearchResultSerializer(ElementLightSerializer): - """ - A page search result with nested transcriptions - """ - zone = serializers.SerializerMethodField(read_only=True) - transcriptions = TranscriptionSerializer(many=True, source='transcriptions_results') - total_transcriptions = serializers.IntegerField() - parent_paths = serializers.ListField( - child=serializers.ListField( - child=ElementLightSerializer() - ), - ) - corpus = CorpusLightSerializer() - date_from = serializers.CharField() - date_to = serializers.CharField() - - class Meta: - model = Element - fields = ( - 'id', - 'name', - 'type', - 'zone', - 'transcriptions', - 'total_transcriptions', - 'parent_paths', - 'corpus', - 'date_from', - 'date_to', - ) - - @extend_schema_field(ZoneSerializer(read_only=True)) - def get_zone(self, element): - """ - Elements used to have zones, which held images and polygons. - Those attributes are now directly on elements. While we could just use - ZoneSerializer(source='*') in the attributes directly, this would cause - the ZoneSerializer to always be present. Instead of having zone=None, - we would see zone={'image': None, 'polygon': None, 'url': None}. - This method ensures the zone attribute is None. 
- """ - if not element.image_id or not element.polygon: - return - return ZoneSerializer(element).data - - -class EntitySearchResultSerializer(serializers.ModelSerializer): - """ - Serializes an entity - """ - type = EnumField(EntityType) - - class Meta: - model = Entity - fields = ( - 'id', - 'name', - 'type', - 'metas' - ) - - class SolrDocumentSerializer(serializers.Serializer): """ Serializes a Solr document diff --git a/arkindex/documents/tests/commands/test_reindex.py b/arkindex/documents/tests/commands/test_reindex.py index fd78ff99cadb470f996c0c20de0e1f059d6bd860..5a0dd55d07e4e33a764667aa5921cc69ba3afeac 100644 --- a/arkindex/documents/tests/commands/test_reindex.py +++ b/arkindex/documents/tests/commands/test_reindex.py @@ -1,275 +1,606 @@ -from unittest.mock import call, patch +from unittest.mock import patch from django.core.management import CommandError, call_command from django.test import override_settings from arkindex.dataimport.models import WorkerVersion -from arkindex.documents.models import Element, Entity, EntityType, MetaType, Transcription -from arkindex.project.elastic import ESElement, ESEntity, ESTranscription +from arkindex.documents.indexer import Indexer +from arkindex.documents.models import Corpus, EntityType, MetaType from arkindex.project.tests import FixtureTestCase +@override_settings(ARKINDEX_FEATURES={'search': True}) class TestReindexCommand(FixtureTestCase): @classmethod def setUpTestData(cls): super().setUpTestData() - cls.indexer_patch = patch('arkindex.documents.management.commands.reindex.Indexer') - cls.vol = cls.corpus.elements.get(name="Volume 1") - worker_version = WorkerVersion.objects.first() - cls.entity = cls.corpus.entities.create(type=EntityType.Misc, name='Dummy entity', worker_version=worker_version) - page = cls.corpus.elements.get(name='Volume 1, page 1r') - page.metadatas.create(name='Dummy metadata', value='Dummy', type=MetaType.Text, entity=cls.entity) - - def setUp(self): - super().setUp() - self.indexer_mock = self.indexer_patch.start() - - def tearDown(self): - super().tearDown() - self.indexer_patch.stop() - - def _assert_all_elements(self, call_args): - """ - Helper method to assert run_index is called to reindex all elements - Required because of self.assertQuerysetEqual, the only way Django has to compare querysets in tests - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Element.objects.all()), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) + cls.private_corpus = Corpus.objects.create(name='private', indexable=True) + cls.worker_version = WorkerVersion.objects.first() + cls.worker = cls.worker_version.worker - def _assert_all_entities(self, call_args): - """ - Helper method to assert run_index is called to reindex all entities - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Entity.objects.all()), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) + # Create element types + folder_type, _ = cls.private_corpus.types.get_or_create(slug='folder', display_name='Folder', folder=True) + cls.page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) + cls.line_type, _ = cls.private_corpus.types.get_or_create(slug='text_line', display_name='Line') + cls.word_type, _ = cls.private_corpus.types.get_or_create(slug='word', display_name='Word') - def _assert_all_transcriptions(self, call_args): - """ - Helper method to assert run_index is called to reindex 
all transcriptions - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Transcription.objects.all()), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) + # Create elements + vol = cls.private_corpus.elements.create(name='Folder', type=folder_type) + cls.page = cls.private_corpus.elements.create(name='New page', type=cls.page_type) + cls.line = cls.private_corpus.elements.create(name='A line', type=cls.line_type) + cls.page.add_parent(vol) + cls.line.add_parent(cls.page) - def _assert_all(self): + @patch("arkindex.documents.indexer.solr") + def test_run_setup(self, mock_solr): """ - Helper method to assert run_index is called three times to reindex everything + Test the reindex setup command """ - self.assertEqual(self.indexer_mock().run_index.call_count, 3) - elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - self._assert_all_elements(elements_call) - self._assert_all_entities(entities_call) - self._assert_all_transcriptions(ts_call) + indexer = Indexer('corpus_id') + mock_solr.collections.exists.return_value = False + mock_solr.schema.does_field_exist.return_value = False - def _assert_folder_elements(self, call_args): - """ - Helper method to assert run_index is called to reindex all elements in a folder - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Element.objects.get_descending(self.vol.id)), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) + call_command('reindex', self.private_corpus.id, setup=True) + self.assertEqual(mock_solr.collections.create.call_count, 1) + self.assertEqual(mock_solr.schema.create_field.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_solr.schema.replace_field.call_count, 0) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 0) - def _assert_folder_entities(self, call_args): + @patch("arkindex.documents.indexer.solr") + def test_run_empty_element(self, mock_solr): """ - Helper method to assert run_index is called to reindex all entities in a folder + Test the reindex command with no indexable type """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Entity.objects.filter( - metadatas__element__in=Element.objects.get_descending(self.vol.id)) - ), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) + self.page_type.indexable = False + self.page_type.save() - def _assert_folder_transcriptions(self, call_args): - """ - Helper method to assert run_index is called to reindex all transcriptions in a folder - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Transcription.objects.filter( - element__in=Element.objects.get_descending(self.vol.id), - )), - ordered=False, - ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 0) - def _assert_folder(self): + @patch('arkindex.documents.indexer.solr') + def test_run_multiple_elements(self, mock_solr): """ - Helper method to assert run_index is called three times to reindex a folder + Test the reindex command for multiple elements """ - self.assertEqual(self.indexer_mock().run_index.call_count, 3) - elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - 
self._assert_folder_elements(elements_call) - self._assert_folder_entities(entities_call) - self._assert_folder_transcriptions(ts_call) + word = self.private_corpus.elements.create(name='A word', type=self.word_type) + word.add_parent(self.line) - def _assert_corpus_elements(self, call_args): - """ - Helper method to assert run_index is called to reindex all elements in a corpus - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Element.objects.filter(corpus=self.corpus)), - ordered=False, + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'id': str(self.page.id), + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'id': str(self.line.id), + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'id': str(word.id), + 'element_id': str(word.id), + 'element_text': word.name, + 'element_type': word.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }] ) - self.assertDictEqual(kwargs, {'bulk_size': 100}) + self.assertDictEqual(kwargs, {'commit': True}) - def _assert_corpus_entities(self, call_args): + @patch('arkindex.documents.indexer.solr') + def test_run_transcriptions(self, mock_solr): """ - Helper method to assert run_index is called to reindex all entities in a corpus + Test the reindex command for element with transcriptions """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Entity.objects.filter(corpus=self.corpus)), - ordered=False, + tr_1 = self.line.transcriptions.create( + confidence=0.8, + text='Transcription for the line', + worker_version=self.worker_version, ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) - - def _assert_corpus_transcriptions(self, call_args): - """ - Helper method to assert run_index is called to reindex all transcriptions in a corpus - """ - (queryset, ), kwargs = call_args - self.assertQuerysetEqual( - queryset, - map(repr, Transcription.objects.filter(element__corpus=self.corpus)), - ordered=False, + tr_2 = self.line.transcriptions.create( + confidence=0.5, + text='Second transcription', + worker_version=self.worker_version, ) - self.assertDictEqual(kwargs, {'bulk_size': 400}) - def _assert_corpus(self): - """ - Helper method to assert run_index is called three times to reindex a folder - """ - self.assertEqual(self.indexer_mock().run_index.call_count, 3) - elements_call, entities_call, ts_call = sorted(self.indexer_mock().run_index.call_args_list, key=repr) - self._assert_corpus_elements(elements_call) - self._assert_corpus_entities(entities_call) - self._assert_corpus_transcriptions(ts_call) + call_command('reindex', self.private_corpus.id) + 
self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + for doc in documents: + self.assertIn('id', doc) + del doc['id'] + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_id': str(tr_1.id), + 'transcription_confidence': tr_1.confidence, + 'transcription_text': tr_1.text, + 'transcription_worker': self.worker.name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_id': str(tr_2.id), + 'transcription_confidence': tr_2.confidence, + 'transcription_text': tr_2.text, + 'transcription_worker': self.worker.name, + }] + ) + self.assertDictEqual(kwargs, {'commit': True}) - def test_acts(self): + @patch('arkindex.documents.indexer.solr') + def test_run_classifications(self, mock_solr): """ - Test the reindex command can reindex acts + Test the reindex command for element with classifications """ - call_command( - 'reindex', - elements=True, + cl_1 = self.line.classifications.create( + confidence=0.8, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Cat') + ) + cl_2 = self.line.classifications.create( + confidence=0.4, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Dog') ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self.assertEqual(self.indexer_mock().run_index.call_count, 1) - self._assert_all_elements(self.indexer_mock().run_index.call_args) - def test_entities(self): - """ - Test the reindex command can reindex entities - """ - call_command( - 'reindex', - entities=True, + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + for doc in documents: + self.assertIn('id', doc) + del doc['id'] + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 
}, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'classification_id': str(cl_1.id), + 'classification_name': cl_1.ml_class.name, + 'classification_confidence': cl_1.confidence, + 'classification_worker': self.worker.name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'classification_id': str(cl_2.id), + 'classification_name': cl_2.ml_class.name, + 'classification_confidence': cl_2.confidence, + 'classification_worker': self.worker.name, + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer.solr') + def test_run_metadatas(self, mock_solr): + """ + Test the reindex command for element with metadatas + """ + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + self.private_corpus.allowed_metadatas.create(type=MetaType.Text, name='Folio') + md_1 = self.line.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + worker_version=self.worker_version, + ) + md_2 = self.line.metadatas.create( + type=MetaType.Text, + name='Folio', + value='1', + worker_version=self.worker_version, ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self.assertEqual(self.indexer_mock().run_index.call_count, 1) - self._assert_all_entities(self.indexer_mock().run_index.call_args) - def test_transcriptions(self): + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + for doc in documents: + self.assertIn('id', doc) + del doc['id'] + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_id': str(md_1.id), + 'metadata_name': md_1.name, + 'metadata_text': md_1.value, + 'metadata_type': md_1.type.value, + 
'metadata_worker': self.worker.name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_id': str(md_2.id), + 'metadata_name': md_2.name, + 'metadata_text': md_2.value, + 'metadata_type': md_2.type.value, + 'metadata_worker': self.worker.name, + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer.solr') + def test_run_entities(self, mock_solr): """ - Test the reindex command can reindex transcriptions + Test the reindex command for element with entities """ - call_command( - 'reindex', - transcriptions=True, + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.line.transcriptions.create( + confidence=0.8, + text='Transcription for the line', ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self.assertEqual(self.indexer_mock().run_index.call_count, 1) - self._assert_all_transcriptions(self.indexer_mock().run_index.call_args) + entity_1.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity_1.name) + ) + entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) + md = self.line.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + entity=entity_2 + ) + + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + for doc in documents: + self.assertIn('id', doc) + del doc['id'] + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_id': str(tr.id), + 'transcription_confidence': tr.confidence, + 'transcription_text': tr.text, + 'transcription_worker': None + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_id': str(md.id), + 'metadata_name': md.name, + 'metadata_text': md.value, + 'metadata_type': md.type.value, + 
'metadata_worker': None + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'entity_id': str(entity_1.id), + 'entity_text': entity_1.name, + 'entity_type': entity_1.type.value, + 'entity_worker': self.worker.name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'entity_id': str(entity_2.id), + 'entity_text': entity_2.name, + 'entity_type': entity_2.type.value, + 'entity_worker': self.worker.name, + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) - def test_all(self): + @patch('arkindex.documents.indexer.solr') + def test_run_element_worker(self, mock_solr): """ - Test the reindex command reindexes everything by default + Test the reindex command for element with a worker version """ - call_command( - 'reindex', - ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self._assert_all() + self.line.worker_version = self.worker_version + self.line.save() + + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + for doc in documents: + self.assertIn('id', doc) + del doc['id'] + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': self.worker.name, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) - def test_folder(self): + @patch('arkindex.documents.indexer.solr') + def test_run_mutiple_parents(self, mock_solr): """ - Test the reindex command can restrict indexing to a specific folder + Test the reindex command for element with multiple paths """ - call_command( - 'reindex', - folder=self.vol, - ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self._assert_folder() + new_page = self.private_corpus.elements.create(name='New page', type=self.page_type) + self.line.add_parent(new_page) + + call_command('reindex', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertCountEqual(documents, [ + { + 'id': str(self.page.id), + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, 
+ 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'id': str(self.line.id), + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'id': str(new_page.id), + 'element_id': str(new_page.id), + 'element_text': new_page.name, + 'element_type': new_page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(new_page.id), + 'parent_name': new_page.name, + 'parent_type': new_page.type.display_name, + }, + { + 'id': str(self.line.id), + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(new_page.id), + 'parent_name': new_page.name, + 'parent_type': new_page.type.display_name, + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) - def test_corpus(self): + @patch('arkindex.documents.indexer.solr') + def test_drop(self, mock_solr): """ - Test the reindex command can restrict indexing to a specific corpus + Test the reindex command can drop indexes """ - call_command( - 'reindex', - corpus=self.corpus, + self.page_type.indexable = False + self.page_type.save() + + call_command('reindex', self.private_corpus.id, drop=True) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) + (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertEqual(query, '*:*') + self.assertDictEqual(kwargs, {'commit': True}) + + def test_corpus_not_found(self): + corpus_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' + with self.assertRaises(CommandError) as context: + call_command('reindex', corpus_id) + self.assertEqual( + str(context.exception), + f'Corpus {corpus_id} does not exist' ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 0) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self._assert_corpus() - def test_drop(self): - """ - Test the reindex command can drop and recreate indexes - """ - call_command( - 'reindex', - drop=True, + def test_corpus_not_indexable(self): + self.private_corpus.indexable = False + self.private_corpus.save() + + with self.assertRaises(CommandError) as context: + call_command('reindex', self.private_corpus.id) + + self.assertEqual( + str(context.exception), + f'Corpus {self.private_corpus.name} is not indexable' ) - self.assertEqual(self.indexer_mock().drop_index.call_count, 3) - self.assertCountEqual(self.indexer_mock().drop_index.call_args_list, [ - call(ESTranscription), - call(ESElement), - call(ESEntity), - ]) - self.assertEqual(self.indexer_mock().setup.call_count, 1) - self._assert_all() @override_settings(ARKINDEX_FEATURES={'search': False}) def test_no_search(self): with self.assertRaises(CommandError) as context: - call_command('reindex') + call_command('reindex', self.private_corpus.id) self.assertEqual( str(context.exception), 'Reindexation is not possible if the search feature flag is disabled. ' - 'Consider setting `features.search` to `on` or `true` or `yes` in the YAML configuration file, ' - 'and configuring ElasticSearch properly.' 
+ 'Consider setting `features.search` to `on` or `true` or `yes` in the YAML ' + 'configuration file, and configuring Solr properly.' ) diff --git a/arkindex/documents/tests/commands/test_reindex_v2.py b/arkindex/documents/tests/commands/test_reindex_v2.py deleted file mode 100644 index 2770ebe2530b695b81896a41ff53c5bf09361776..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/commands/test_reindex_v2.py +++ /dev/null @@ -1,606 +0,0 @@ -from unittest.mock import patch - -from django.core.management import CommandError, call_command -from django.test import override_settings - -from arkindex.dataimport.models import WorkerVersion -from arkindex.documents.indexer_v2 import Indexer -from arkindex.documents.models import Corpus, EntityType, MetaType -from arkindex.project.tests import FixtureTestCase - - -@override_settings(ARKINDEX_FEATURES={'search_v2': True}) -class TestReindexV2Command(FixtureTestCase): - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - cls.private_corpus = Corpus.objects.create(name='private', indexable=True) - cls.worker_version = WorkerVersion.objects.first() - cls.worker = cls.worker_version.worker - - # Create element types - folder_type, _ = cls.private_corpus.types.get_or_create(slug='folder', display_name='Folder', folder=True) - cls.page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) - cls.line_type, _ = cls.private_corpus.types.get_or_create(slug='text_line', display_name='Line') - cls.word_type, _ = cls.private_corpus.types.get_or_create(slug='word', display_name='Word') - - # Create elements - vol = cls.private_corpus.elements.create(name='Folder', type=folder_type) - cls.page = cls.private_corpus.elements.create(name='New page', type=cls.page_type) - cls.line = cls.private_corpus.elements.create(name='A line', type=cls.line_type) - cls.page.add_parent(vol) - cls.line.add_parent(cls.page) - - @patch("arkindex.documents.indexer_v2.solr") - def test_run_setup(self, mock_solr): - """ - Test the reindex setup command - """ - indexer = Indexer('corpus_id') - mock_solr.collections.exists.return_value = False - mock_solr.schema.does_field_exist.return_value = False - - call_command('reindex_v2', self.private_corpus.id, setup=True) - self.assertEqual(mock_solr.collections.create.call_count, 1) - self.assertEqual(mock_solr.schema.create_field.call_count, len(indexer.solr_fields)) - self.assertEqual(mock_solr.schema.replace_field.call_count, 0) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 0) - - @patch("arkindex.documents.indexer_v2.solr") - def test_run_empty_element(self, mock_solr): - """ - Test the reindex command with no indexable type - """ - self.page_type.indexable = False - self.page_type.save() - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 0) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_multiple_elements(self, mock_solr): - """ - Test the reindex command for multiple elements - """ - word = self.private_corpus.elements.create(name='A word', type=self.word_type) - word.add_parent(self.line) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - self.assertEqual(index_name, 
f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'id': str(self.page.id), - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'id': str(self.line.id), - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'id': str(word.id), - 'element_id': str(word.id), - 'element_text': word.name, - 'element_type': word.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }] - ) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_transcriptions(self, mock_solr): - """ - Test the reindex command for element with transcriptions - """ - tr_1 = self.line.transcriptions.create( - confidence=0.8, - text='Transcription for the line', - worker_version=self.worker_version, - ) - tr_2 = self.line.transcriptions.create( - confidence=0.5, - text='Second transcription', - worker_version=self.worker_version, - ) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - for doc in documents: - self.assertIn('id', doc) - del doc['id'] - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'transcription_id': str(tr_1.id), - 'transcription_confidence': tr_1.confidence, - 'transcription_text': tr_1.text, - 'transcription_worker': self.worker.name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'transcription_id': str(tr_2.id), - 'transcription_confidence': tr_2.confidence, - 'transcription_text': tr_2.text, - 'transcription_worker': self.worker.name, - }] - ) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_classifications(self, mock_solr): - """ - Test the reindex 
command for element with classifications - """ - cl_1 = self.line.classifications.create( - confidence=0.8, - worker_version=self.worker_version, - ml_class=self.private_corpus.ml_classes.create(name='Cat') - ) - cl_2 = self.line.classifications.create( - confidence=0.4, - worker_version=self.worker_version, - ml_class=self.private_corpus.ml_classes.create(name='Dog') - ) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - for doc in documents: - self.assertIn('id', doc) - del doc['id'] - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'classification_id': str(cl_1.id), - 'classification_name': cl_1.ml_class.name, - 'classification_confidence': cl_1.confidence, - 'classification_worker': self.worker.name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'classification_id': str(cl_2.id), - 'classification_name': cl_2.ml_class.name, - 'classification_confidence': cl_2.confidence, - 'classification_worker': self.worker.name, - } - ]) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_metadatas(self, mock_solr): - """ - Test the reindex command for element with metadatas - """ - self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') - self.private_corpus.allowed_metadatas.create(type=MetaType.Text, name='Folio') - md_1 = self.line.metadatas.create( - type=MetaType.Location, - name='Country', - value='France', - worker_version=self.worker_version, - ) - md_2 = self.line.metadatas.create( - type=MetaType.Text, - name='Folio', - value='1', - worker_version=self.worker_version, - ) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - for doc in documents: - self.assertIn('id', doc) - del doc['id'] - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 
'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'metadata_id': str(md_1.id), - 'metadata_name': md_1.name, - 'metadata_text': md_1.value, - 'metadata_type': md_1.type.value, - 'metadata_worker': self.worker.name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'metadata_id': str(md_2.id), - 'metadata_name': md_2.name, - 'metadata_text': md_2.value, - 'metadata_type': md_2.type.value, - 'metadata_worker': self.worker.name, - } - ]) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_entities(self, mock_solr): - """ - Test the reindex command for element with entities - """ - self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') - entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) - tr = self.line.transcriptions.create( - confidence=0.8, - text='Transcription for the line', - ) - entity_1.transcription_entities.create( - transcription=tr, - offset=0, - length=len(entity_1.name) - ) - entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) - md = self.line.metadatas.create( - type=MetaType.Location, - name='Country', - value='France', - entity=entity_2 - ) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - for doc in documents: - self.assertIn('id', doc) - del doc['id'] - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'transcription_id': str(tr.id), - 'transcription_confidence': tr.confidence, - 'transcription_text': tr.text, - 'transcription_worker': 
None - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'metadata_id': str(md.id), - 'metadata_name': md.name, - 'metadata_text': md.value, - 'metadata_type': md.type.value, - 'metadata_worker': None - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'entity_id': str(entity_1.id), - 'entity_text': entity_1.name, - 'entity_type': entity_1.type.value, - 'entity_worker': self.worker.name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'entity_id': str(entity_2.id), - 'entity_text': entity_2.name, - 'entity_type': entity_2.type.value, - 'entity_worker': self.worker.name, - } - ]) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_element_worker(self, mock_solr): - """ - Test the reindex command for element with a worker version - """ - self.line.worker_version = self.worker_version - self.line.save() - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - for doc in documents: - self.assertIn('id', doc) - del doc['id'] - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': self.worker.name, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - } - ]) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_run_mutiple_parents(self, mock_solr): - """ - Test the reindex command for element with multiple paths - """ - new_page = self.private_corpus.elements.create(name='New page', type=self.page_type) - self.line.add_parent(new_page) - - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertCountEqual(documents, [ - { - 'id': str(self.page.id), - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': 
self.page.type.display_name, - }, - { - 'id': str(self.line.id), - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'id': str(new_page.id), - 'element_id': str(new_page.id), - 'element_text': new_page.name, - 'element_type': new_page.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(new_page.id), - 'parent_name': new_page.name, - 'parent_type': new_page.type.display_name, - }, - { - 'id': str(self.line.id), - 'element_id': str(self.line.id), - 'element_text': self.line.name, - 'element_type': self.line.type.display_name, - 'element_worker': None, - 'element_image': None, - 'parent_id': str(new_page.id), - 'parent_name': new_page.name, - 'parent_type': new_page.type.display_name, - } - ]) - self.assertDictEqual(kwargs, {'commit': True}) - - @patch('arkindex.documents.indexer_v2.solr') - def test_drop(self, mock_solr): - """ - Test the reindex command can drop indexes - """ - self.page_type.indexable = False - self.page_type.save() - - call_command('reindex_v2', self.private_corpus.id, drop=True) - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) - (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args - self.assertEqual(index_name, f'project-{self.private_corpus.id}') - self.assertEqual(query, '*:*') - self.assertDictEqual(kwargs, {'commit': True}) - - def test_corpus_not_found(self): - corpus_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' - with self.assertRaises(CommandError) as context: - call_command('reindex_v2', corpus_id) - self.assertEqual( - str(context.exception), - f'Corpus {corpus_id} does not exist' - ) - - def test_corpus_not_indexable(self): - self.private_corpus.indexable = False - self.private_corpus.save() - - with self.assertRaises(CommandError) as context: - call_command('reindex_v2', self.private_corpus.id) - - self.assertEqual( - str(context.exception), - f'Corpus {self.private_corpus.name} is not indexable' - ) - - @override_settings(ARKINDEX_FEATURES={'search_v2': False}) - def test_no_search(self): - with self.assertRaises(CommandError) as context: - call_command('reindex_v2', self.private_corpus.id) - self.assertEqual( - str(context.exception), - 'Reindexation is not possible if the search feature flag is disabled. ' - 'Consider setting `features.search_v2` to `on` or `true` or `yes` in the YAML ' - 'configuration file, and configuring Solr properly.' 
- ) diff --git a/arkindex/documents/tests/test_create_transcriptions.py b/arkindex/documents/tests/test_create_transcriptions.py index acc316904ebb73906fbff32248864decc3b1deaf..63bf23516a86b53f89a5b06b8459b2ba32a39056 100644 --- a/arkindex/documents/tests/test_create_transcriptions.py +++ b/arkindex/documents/tests/test_create_transcriptions.py @@ -133,9 +133,6 @@ class TestTranscriptionCreate(FixtureAPITestCase): }) def test_create_transcription_worker_version(self): - """ - Creates a transcription with a worker version triggers its indexation on ElasticSearch - """ self.client.force_login(self.internal_user) response = self.client.post( reverse('api:transcription-create', kwargs={'pk': self.line.id}), diff --git a/arkindex/documents/tests/test_entities_api.py b/arkindex/documents/tests/test_entities_api.py index e3c4cb41ba033b5702edefc590aef52636d5ca3c..281744f748303268b6ffe42a42648056a200c2aa 100644 --- a/arkindex/documents/tests/test_entities_api.py +++ b/arkindex/documents/tests/test_entities_api.py @@ -1,10 +1,7 @@ import uuid -from unittest.mock import call, patch from django.contrib.gis.geos import LinearRing -from django.test import override_settings from django.urls import reverse -from elasticsearch.exceptions import NotFoundError from rest_framework import status from arkindex.dataimport.models import WorkerVersion @@ -897,55 +894,12 @@ class TestEntitiesAPI(FixtureAPITestCase): response = self.client.get(reverse('api:element-entities', kwargs={'pk': str(self.element.id)})) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - @patch('arkindex.documents.api.entities.ESEntity') - def test_delete_entity(self, entity_mock): + def test_delete_entity(self): self.client.force_login(self.user) response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) with self.assertRaises(Entity.DoesNotExist): Entity.objects.get(id=self.entity.id) - self.assertEqual(entity_mock.get.call_count, 1) - args, kwargs = entity_mock.get.call_args - self.assertTupleEqual(args, ()) - self.assertDictEqual(kwargs, {'id': self.entity.id.hex}) - self.assertEqual(entity_mock.get.return_value.delete.call_count, 1) - - @patch('arkindex.documents.api.entities.ESEntity') - def test_delete_entity_elasticsearch_error(self, entity_mock): - self.client.force_login(self.user) - entity_mock.get.side_effect = NotFoundError - response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - with self.assertRaises(Entity.DoesNotExist): - Entity.objects.get(id=self.entity.id) - self.assertEqual(entity_mock.get.call_count, 1) - args, kwargs = entity_mock.get.call_args - self.assertTupleEqual(args, ()) - self.assertDictEqual(kwargs, {'id': self.entity.id.hex}) - self.assertEqual(entity_mock.get.return_value.delete.call_count, 0) - - @patch('arkindex.documents.api.entities.ESEntity') - def test_delete_entity_elasticsearch_down(self, entity_mock): - """The deletion endpoint should work even when an exception - occurs during the ES index update""" - self.client.force_login(self.user) - entity_mock.get().delete.side_effect = Exception("Es server is down") - response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - with self.assertRaises(Entity.DoesNotExist): - Entity.objects.get(id=self.entity.id) - 
self.assertEqual(entity_mock.get().delete.call_count, 1) - self.assertEqual(entity_mock.get.return_value.delete.call_count, 1) - - @override_settings(ARKINDEX_FEATURES={'search': False}) - @patch('arkindex.documents.api.entities.ESEntity') - def test_delete_entity_no_search(self, entity_mock): - self.client.force_login(self.user) - response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - with self.assertRaises(Entity.DoesNotExist): - Entity.objects.get(id=self.entity.id) - self.assertFalse(entity_mock.get.called) def test_delete_entity_requires_login(self): response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity_bis.id)})) @@ -1074,22 +1028,3 @@ class TestEntitiesAPI(FixtureAPITestCase): with self.assertExactQueries('element_links_not_found.sql', params={'element_id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'}): response = self.client.get(reverse('api:element-links', kwargs={'pk': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'})) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - - @patch('arkindex.documents.api.entities.ESEntity') - def test_entity_delete_index(self, es_entity_mock): - self.client.force_login(self.user) - response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - self.assertEqual(es_entity_mock.get.call_count, 1) - self.assertEqual(es_entity_mock.get.call_args, call(id=self.entity.id.hex)) - self.assertEqual(es_entity_mock.get.return_value.delete.call_count, 1) - self.assertEqual(Entity.objects.filter(id=self.entity.id).exists(), False) - - @patch('arkindex.documents.api.entities.ESEntity') - def test_entity_delete_wrong_index(self, es_entity_mock): - es_entity_mock.get.side_effect = NotFoundError() - self.client.force_login(self.user) - response = self.client.delete(reverse('api:entity-details', kwargs={'pk': str(self.entity.id)})) - self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) - self.assertEqual(es_entity_mock.get.call_count, 1) - self.assertEqual(es_entity_mock.get.return_value.delete.call_count, 0) diff --git a/arkindex/documents/tests/test_indexer.py b/arkindex/documents/tests/test_indexer.py index 494601ae6bc43744296fd31d04a0c14a111d1876..a2f21562cf61d57ca041f58e49becca30fc1e20b 100644 --- a/arkindex/documents/tests/test_indexer.py +++ b/arkindex/documents/tests/test_indexer.py @@ -1,140 +1,337 @@ -from unittest.mock import MagicMock, call, patch +import json +from hashlib import md5 +from unittest.mock import patch +from uuid import UUID -from elasticsearch import Elasticsearch -from elasticsearch.exceptions import NotFoundError +from django.db.models import CharField, Value from arkindex.dataimport.models import WorkerVersion from arkindex.documents.indexer import Indexer -from arkindex.documents.models import EntityType +from arkindex.documents.models import Corpus, EntityType, MetaType from arkindex.project.tests import FixtureTestCase -class TestIndexer(FixtureTestCase): +class TestIndexerCommand(FixtureTestCase): @classmethod def setUpTestData(cls): super().setUpTestData() - worker_version = WorkerVersion.objects.first() - for i in range(10): - cls.corpus.entities.create( - name=f'ES Dummy {i}', - type=EntityType.Misc, - worker_version=worker_version, - ) - - @patch('arkindex.documents.indexer.Elasticsearch') - def test_drop_index(self, es_mock): - document_mock = MagicMock() - 
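# --- Illustrative sketch, not part of the patch ---
# The ElasticSearch drop_index test being deleted here is superseded by the
# Solr-based Indexer tested later in this diff, which clears the per-corpus
# collection with a delete-by-query instead of dropping an index. A minimal
# standalone equivalent with SolrClient might look like this; the host URL and
# collection name are hypothetical placeholders, while the '*:*' query and
# commit=True mirror the call the new tests assert.
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')              # assumed Solr endpoint
collection = 'project-00000000-0000-0000-0000-000000000000'  # one collection per corpus

# Remove every document from the collection and commit immediately.
solr.delete_doc_by_query(collection, '*:*', commit=True)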
document_mock.__name__ = 'ESDummy' - indexer = Indexer(['a', 'b']) - self.assertEqual(es_mock.call_count, 1) - self.assertEqual(es_mock.call_args, call(['a', 'b'])) - - indexer.drop_index(document_mock) - self.assertEqual(document_mock._index.delete.call_count, 1) - self.assertEqual(document_mock._index.delete.call_args, call()) - - @patch('arkindex.documents.indexer.Elasticsearch') - def test_drop_index_handle_exceptions(self, es_mock): - # Catches NotFoundError - document_mock = MagicMock() - document_mock.__name__ = 'ESDummy' - document_mock._index.delete.side_effect = NotFoundError - Indexer().drop_index(document_mock) - self.assertEqual(document_mock._index.delete.call_count, 1) - self.assertEqual(document_mock._index.delete.call_args, call()) - - # Raises anything else - document_mock.reset_mock() - document_mock._index.delete.side_effect = ValueError - with self.assertRaises(ValueError): - Indexer().drop_index(document_mock) - self.assertEqual(document_mock._index.delete.call_count, 1) - self.assertEqual(document_mock._index.delete.call_args, call()) - - @patch('arkindex.documents.indexer.es_bulk') - def test_run_index(self, es_bulk_mock): - es_bulk_mock.return_value = (5, []) - - queryset = self.corpus.entities.filter(name__startswith='ES Dummy') - Indexer().run_index(queryset, bulk_size=5) - - self.assertEqual(es_bulk_mock.call_count, 2) - for i, ((es_instance, actions), kwargs) in enumerate(es_bulk_mock.call_args_list): - self.assertIsInstance(es_instance, Elasticsearch) - self.assertListEqual(actions, [ - { - '_index': 'entities', - '_type': 'entity', - '_id': entity.id.hex, - '_source': { - 'corpus': self.corpus.id, - 'type': entity.type.value, - 'name': entity.name, - }, - } - for entity in queryset[i * 5:(i + 1) * 5] - ]) + cls.private_corpus = Corpus.objects.create(name='private', indexable=True) + cls.worker_version = WorkerVersion.objects.first() + cls.worker = cls.worker_version.worker + page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) + cls.page = cls.private_corpus.elements.create(name='New page', type=page_type, worker_version=cls.worker_version) + + @patch('arkindex.documents.indexer.solr') + def test_setup(self, mock_solr): + mock_collections = mock_solr.collections + mock_schema = mock_solr.schema + mock_collections.exists.return_value = False + mock_schema.does_field_exist.return_value = False + + indexer = Indexer('corpus_id') + indexer.setup() + + self.assertEqual(mock_collections.exists.call_count, 1) + (index_name, ), _ = mock_collections.exists.call_args + self.assertEqual(index_name, indexer.collection_name) + + self.assertEqual(mock_collections.create.call_count, 1) + (index_name, args), _ = mock_collections.create.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertEqual(args, indexer.solr_num_shards) + + self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) + for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): + self.assertDictEqual(kwargs, { + 'method': 'POST', + 'endpoint': mock_solr.schema.schema_endpoint, + 'collection': indexer.collection_name, + 'data': json.dumps({'add-field-type': type_field}) + }) + + self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_schema.create_field.call_count, len(indexer.solr_fields)) + for field, ((index_name, args), _) in zip(indexer.solr_fields, mock_schema.create_field.call_args_list): + 
self.assertEqual(index_name, indexer.collection_name) + self.assertDictEqual(args, field) + self.assertEqual(mock_schema.replace_field.call_count, 0) + + @patch('arkindex.documents.indexer.solr') + def test_already_setup(self, mock_solr): + mock_collections = mock_solr.collections + mock_schema = mock_solr.schema + mock_collections.exists.return_value = True + mock_schema.does_field_exist.return_value = True + + indexer = Indexer('corpus_id') + indexer.setup() + + self.assertEqual(mock_collections.exists.call_count, 1) + (index_name, ), _ = mock_collections.exists.call_args + self.assertEqual(index_name, indexer.collection_name) + + self.assertEqual(mock_collections.create.call_count, 0) + + self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) + for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): self.assertDictEqual(kwargs, { - 'chunk_size': 100, - 'initial_backoff': 2, - 'max_retries': 2, - 'request_timeout': 15, - 'stats_only': True, - '_source': False, + 'method': 'POST', + 'endpoint': mock_solr.schema.schema_endpoint, + 'collection': indexer.collection_name, + 'data': json.dumps({'add-field-type': type_field}) }) - @patch('arkindex.documents.indexer.es_bulk') - def test_index_retry(self, es_bulk_mock): - # Keep failing - es_bulk_mock.side_effect = Exception - - queryset = self.corpus.entities.filter(name__startswith='ES Dummy') - count = Indexer().index(queryset, chunk_size=10, timeout=42) - - self.assertEqual(count, 0) - self.assertEqual(es_bulk_mock.call_count, 3) - args, kwargs = zip(*es_bulk_mock.call_args_list) - - for es_instance, actions in args: - self.assertIsInstance(es_instance, Elasticsearch) - self.assertListEqual(actions, [ - { - '_index': 'entities', - '_type': 'entity', - '_id': entity.id.hex, - '_source': { - 'corpus': self.corpus.id, - 'type': entity.type.value, - 'name': entity.name, - }, - } - for entity in queryset - ]) - - self.assertTupleEqual(kwargs, ( + self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_schema.replace_field.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_schema.create_field.call_count, 0) + + @patch('arkindex.documents.indexer.solr') + def test_drop_index(self, mock_solr): + indexer = Indexer('corpus_id') + indexer.drop_index() + + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) + (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertEqual(query, '*:*') + self.assertDictEqual(kwargs, {'commit': True}) + + def test_hash_worker(self): + indexer = Indexer(None) + self.assertIsNone(indexer.hash_worker(None)) + self.assertEqual(indexer.hash_worker(self.worker_version), self.worker.name) + + def test_build_id(self): + expected = UUID(md5(self.page.id.bytes + self.page.id.bytes).hexdigest()) + indexer = Indexer(None) + self.assertEqual(indexer.build_solr_id(self.page, self.page), expected) + + def test_build_element(self): + annotated_pages = self.private_corpus.elements.filter(id=self.page.id).annotate( + parent_id=Value(self.page.id, output_field=CharField()), + parent_name=Value(self.page.name, output_field=CharField()), + parent_type=Value(self.page.type.display_name, output_field=CharField()), + type_name=Value(self.page.type.display_name, output_field=CharField()) + ) + indexer = Indexer(None) + self.assertDictEqual(indexer.build_element(annotated_pages.first()), { + 'id': 
str(self.page.id), + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': self.worker.name, + 'element_image': None + }) + + def test_build_transcriptions(self): + tr_1 = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + worker_version=self.worker_version, + ) + tr_2 = self.page.transcriptions.create( + confidence=0.5, + text='Second transcription', + worker_version=self.worker_version, + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_transcriptions(self.page, {'key': 'value'}), [ { - 'chunk_size': 10, - 'initial_backoff': 2, - 'max_retries': 2, - 'request_timeout': 42, - 'stats_only': True, - '_source': False, + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, tr_1)), + 'transcription_id': str(tr_1.id), + 'transcription_confidence': tr_1.confidence, + 'transcription_text': tr_1.text, + 'transcription_worker': self.worker.name, }, { - 'chunk_size': 5, - 'initial_backoff': 2, - 'max_retries': 2, - 'request_timeout': 84, - 'stats_only': True, - '_source': False, + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, tr_2)), + 'transcription_id': str(tr_2.id), + 'transcription_confidence': tr_2.confidence, + 'transcription_text': tr_2.text, + 'transcription_worker': self.worker.name, + } + ]) + + def test_build_classifications(self): + cl_1 = self.page.classifications.create( + confidence=0.8, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Cat') + ) + cl_2 = self.page.classifications.create( + confidence=0.4, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Dog') + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_classifications(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, cl_1)), + 'classification_id': str(cl_1.id), + 'classification_name': cl_1.ml_class.name, + 'classification_confidence': cl_1.confidence, + 'classification_worker': self.worker.name, }, { - 'chunk_size': 2, - 'initial_backoff': 2, - 'max_retries': 2, - 'request_timeout': 168, - 'stats_only': True, - '_source': False, + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, cl_2)), + 'classification_id': str(cl_2.id), + 'classification_name': cl_2.ml_class.name, + 'classification_confidence': cl_2.confidence, + 'classification_worker': self.worker.name, + } + ]) + + def test_build_metadatas(self): + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + self.private_corpus.allowed_metadatas.create(type=MetaType.Text, name='Folio') + md_1 = self.page.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + worker_version=self.worker_version, + ) + md_2 = self.page.metadatas.create( + type=MetaType.Text, + name='Folio', + value='1', + worker_version=self.worker_version, + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_metadatas(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, md_1)), + 'metadata_id': str(md_1.id), + 'metadata_name': md_1.name, + 'metadata_text': md_1.value, + 'metadata_type': md_1.type.value, + 'metadata_worker': self.worker.name, + }, + { + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, md_2)), + 'metadata_id': str(md_2.id), + 'metadata_name': 
md_2.name, + 'metadata_text': md_2.value, + 'metadata_type': md_2.type.value, + 'metadata_worker': self.worker.name, + } + ]) + + def test_build_entities(self): + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + ) + entity_1.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity_1.name) + ) + entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) + self.page.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + entity=entity_2 + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_entities(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, entity_1)), + 'entity_id': str(entity_1.id), + 'entity_text': entity_1.name, + 'entity_type': entity_1.type.value, + 'entity_worker': self.worker.name, }, - )) + { + 'key': 'value', + 'id': str(indexer.build_solr_id(self.page, entity_2)), + 'entity_id': str(entity_2.id), + 'entity_text': entity_2.name, + 'entity_type': entity_2.type.value, + 'entity_worker': self.worker.name, + } + ]) + + @patch('arkindex.documents.indexer.solr') + def test_index(self, mock_solr): + entity = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + ) + entity.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity.name) + ) + + indexer = Indexer(self.private_corpus.id) + with self.assertExactQueries('indexer_prefetch.sql', params={ + 'corpus_id': self.private_corpus.id, + 'page_id': self.page.id, + 'image_id': self.page.image_id, + 'worker_version_id': self.worker_version.id, + 'worker_id': self.worker.id, + 'transcription_id': tr.id + }): + indexer.index() + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertListEqual(documents, [ + { + 'id': str(self.page.id), + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': self.worker.name, + 'element_image': self.page.iiif_thumbnail_url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'id': str(indexer.build_solr_id(self.page, tr)), + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': self.worker.name, + 'element_image': self.page.iiif_thumbnail_url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_id': str(tr.id), + 'transcription_confidence': tr.confidence, + 'transcription_text': tr.text, + 'transcription_worker': None + }, + { + 'id': str(indexer.build_solr_id(self.page, entity)), + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': self.worker.name, + 'element_image': self.page.iiif_thumbnail_url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': 
self.page.type.display_name, + 'entity_id': str(entity.id), + 'entity_text': entity.name, + 'entity_type': entity.type.value, + 'entity_worker': self.worker.name, + }] + ) + self.assertDictEqual(kwargs, {'commit': True}) diff --git a/arkindex/documents/tests/test_indexer_v2.py b/arkindex/documents/tests/test_indexer_v2.py deleted file mode 100644 index f5550878c20863cc47d1cb12ffd9f12f9c30971f..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/test_indexer_v2.py +++ /dev/null @@ -1,337 +0,0 @@ -import json -from hashlib import md5 -from unittest.mock import patch -from uuid import UUID - -from django.db.models import CharField, Value - -from arkindex.dataimport.models import WorkerVersion -from arkindex.documents.indexer_v2 import Indexer -from arkindex.documents.models import Corpus, EntityType, MetaType -from arkindex.project.tests import FixtureTestCase - - -class TestIndexerV2Command(FixtureTestCase): - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - cls.private_corpus = Corpus.objects.create(name='private', indexable=True) - cls.worker_version = WorkerVersion.objects.first() - cls.worker = cls.worker_version.worker - page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) - cls.page = cls.private_corpus.elements.create(name='New page', type=page_type, worker_version=cls.worker_version) - - @patch('arkindex.documents.indexer_v2.solr') - def test_setup(self, mock_solr): - mock_collections = mock_solr.collections - mock_schema = mock_solr.schema - mock_collections.exists.return_value = False - mock_schema.does_field_exist.return_value = False - - indexer = Indexer('corpus_id') - indexer.setup() - - self.assertEqual(mock_collections.exists.call_count, 1) - (index_name, ), _ = mock_collections.exists.call_args - self.assertEqual(index_name, indexer.collection_name) - - self.assertEqual(mock_collections.create.call_count, 1) - (index_name, args), _ = mock_collections.create.call_args - self.assertEqual(index_name, indexer.collection_name) - self.assertEqual(args, indexer.solr_num_shards) - - self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) - for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): - self.assertDictEqual(kwargs, { - 'method': 'POST', - 'endpoint': mock_solr.schema.schema_endpoint, - 'collection': indexer.collection_name, - 'data': json.dumps({'add-field-type': type_field}) - }) - - self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) - self.assertEqual(mock_schema.create_field.call_count, len(indexer.solr_fields)) - for field, ((index_name, args), _) in zip(indexer.solr_fields, mock_schema.create_field.call_args_list): - self.assertEqual(index_name, indexer.collection_name) - self.assertDictEqual(args, field) - self.assertEqual(mock_schema.replace_field.call_count, 0) - - @patch('arkindex.documents.indexer_v2.solr') - def test_already_setup(self, mock_solr): - mock_collections = mock_solr.collections - mock_schema = mock_solr.schema - mock_collections.exists.return_value = True - mock_schema.does_field_exist.return_value = True - - indexer = Indexer('corpus_id') - indexer.setup() - - self.assertEqual(mock_collections.exists.call_count, 1) - (index_name, ), _ = mock_collections.exists.call_args - self.assertEqual(index_name, indexer.collection_name) - - self.assertEqual(mock_collections.create.call_count, 0) - - 
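# --- Illustrative sketch, not part of the patch ---
# Both the new test_indexer.py above and the deleted test_indexer_v2.py below
# assert that build_solr_id derives a stable Solr document id from the MD5 of
# the parent and child UUID bytes. A self-contained reproduction of that
# scheme, using hypothetical stand-in ids, could look like this.
from hashlib import md5
from uuid import UUID, uuid4

def build_solr_id(parent_id: UUID, child_id: UUID) -> UUID:
    # Concatenate the raw UUID bytes and hash them; the 32-character hex
    # digest is itself a valid UUID string.
    return UUID(md5(parent_id.bytes + child_id.bytes).hexdigest())

page_id = uuid4()
# For an element indexed against itself, both halves use the same UUID,
# matching the expectation in test_build_id.
assert build_solr_id(page_id, page_id) == UUID(md5(page_id.bytes + page_id.bytes).hexdigest())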
self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) - for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): - self.assertDictEqual(kwargs, { - 'method': 'POST', - 'endpoint': mock_solr.schema.schema_endpoint, - 'collection': indexer.collection_name, - 'data': json.dumps({'add-field-type': type_field}) - }) - - self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) - self.assertEqual(mock_schema.replace_field.call_count, len(indexer.solr_fields)) - self.assertEqual(mock_schema.create_field.call_count, 0) - - @patch('arkindex.documents.indexer_v2.solr') - def test_drop_index(self, mock_solr): - indexer = Indexer('corpus_id') - indexer.drop_index() - - self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) - (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args - self.assertEqual(index_name, indexer.collection_name) - self.assertEqual(query, '*:*') - self.assertDictEqual(kwargs, {'commit': True}) - - def test_hash_worker(self): - indexer = Indexer(None) - self.assertIsNone(indexer.hash_worker(None)) - self.assertEqual(indexer.hash_worker(self.worker_version), self.worker.name) - - def test_build_id(self): - expected = UUID(md5(self.page.id.bytes + self.page.id.bytes).hexdigest()) - indexer = Indexer(None) - self.assertEqual(indexer.build_solr_id(self.page, self.page), expected) - - def test_build_element(self): - annotated_pages = self.private_corpus.elements.filter(id=self.page.id).annotate( - parent_id=Value(self.page.id, output_field=CharField()), - parent_name=Value(self.page.name, output_field=CharField()), - parent_type=Value(self.page.type.display_name, output_field=CharField()), - type_name=Value(self.page.type.display_name, output_field=CharField()) - ) - indexer = Indexer(None) - self.assertDictEqual(indexer.build_element(annotated_pages.first()), { - 'id': str(self.page.id), - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': self.worker.name, - 'element_image': None - }) - - def test_build_transcriptions(self): - tr_1 = self.page.transcriptions.create( - confidence=0.8, - text='Transcription on the page', - worker_version=self.worker_version, - ) - tr_2 = self.page.transcriptions.create( - confidence=0.5, - text='Second transcription', - worker_version=self.worker_version, - ) - indexer = Indexer(None) - self.assertListEqual(indexer.build_transcriptions(self.page, {'key': 'value'}), [ - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, tr_1)), - 'transcription_id': str(tr_1.id), - 'transcription_confidence': tr_1.confidence, - 'transcription_text': tr_1.text, - 'transcription_worker': self.worker.name, - }, - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, tr_2)), - 'transcription_id': str(tr_2.id), - 'transcription_confidence': tr_2.confidence, - 'transcription_text': tr_2.text, - 'transcription_worker': self.worker.name, - } - ]) - - def test_build_classifications(self): - cl_1 = self.page.classifications.create( - confidence=0.8, - worker_version=self.worker_version, - ml_class=self.private_corpus.ml_classes.create(name='Cat') - ) - cl_2 = self.page.classifications.create( - confidence=0.4, - worker_version=self.worker_version, - ml_class=self.private_corpus.ml_classes.create(name='Dog') - ) - indexer = 
Indexer(None) - self.assertListEqual(indexer.build_classifications(self.page, {'key': 'value'}), [ - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, cl_1)), - 'classification_id': str(cl_1.id), - 'classification_name': cl_1.ml_class.name, - 'classification_confidence': cl_1.confidence, - 'classification_worker': self.worker.name, - }, - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, cl_2)), - 'classification_id': str(cl_2.id), - 'classification_name': cl_2.ml_class.name, - 'classification_confidence': cl_2.confidence, - 'classification_worker': self.worker.name, - } - ]) - - def test_build_metadatas(self): - self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') - self.private_corpus.allowed_metadatas.create(type=MetaType.Text, name='Folio') - md_1 = self.page.metadatas.create( - type=MetaType.Location, - name='Country', - value='France', - worker_version=self.worker_version, - ) - md_2 = self.page.metadatas.create( - type=MetaType.Text, - name='Folio', - value='1', - worker_version=self.worker_version, - ) - indexer = Indexer(None) - self.assertListEqual(indexer.build_metadatas(self.page, {'key': 'value'}), [ - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, md_1)), - 'metadata_id': str(md_1.id), - 'metadata_name': md_1.name, - 'metadata_text': md_1.value, - 'metadata_type': md_1.type.value, - 'metadata_worker': self.worker.name, - }, - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, md_2)), - 'metadata_id': str(md_2.id), - 'metadata_name': md_2.name, - 'metadata_text': md_2.value, - 'metadata_type': md_2.type.value, - 'metadata_worker': self.worker.name, - } - ]) - - def test_build_entities(self): - self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') - entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) - tr = self.page.transcriptions.create( - confidence=0.8, - text='Transcription on the page', - ) - entity_1.transcription_entities.create( - transcription=tr, - offset=0, - length=len(entity_1.name) - ) - entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) - self.page.metadatas.create( - type=MetaType.Location, - name='Country', - value='France', - entity=entity_2 - ) - indexer = Indexer(None) - self.assertListEqual(indexer.build_entities(self.page, {'key': 'value'}), [ - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, entity_1)), - 'entity_id': str(entity_1.id), - 'entity_text': entity_1.name, - 'entity_type': entity_1.type.value, - 'entity_worker': self.worker.name, - }, - { - 'key': 'value', - 'id': str(indexer.build_solr_id(self.page, entity_2)), - 'entity_id': str(entity_2.id), - 'entity_text': entity_2.name, - 'entity_type': entity_2.type.value, - 'entity_worker': self.worker.name, - } - ]) - - @patch('arkindex.documents.indexer_v2.solr') - def test_index(self, mock_solr): - entity = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) - tr = self.page.transcriptions.create( - confidence=0.8, - text='Transcription on the page', - ) - entity.transcription_entities.create( - transcription=tr, - offset=0, - length=len(entity.name) - ) - - indexer = Indexer(self.private_corpus.id) - with self.assertExactQueries('indexer_prefetch_v2.sql', params={ - 'corpus_id': self.private_corpus.id, - 'page_id': self.page.id, - 'image_id': self.page.image_id, 
- 'worker_version_id': self.worker_version.id, - 'worker_id': self.worker.id, - 'transcription_id': tr.id - }): - indexer.index() - self.assertEqual(mock_solr.index.call_count, 1) - (index_name, documents), kwargs = mock_solr.index.call_args - self.assertEqual(index_name, indexer.collection_name) - self.assertListEqual(documents, [ - { - 'id': str(self.page.id), - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': self.worker.name, - 'element_image': self.page.iiif_thumbnail_url, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - }, - { - 'id': str(indexer.build_solr_id(self.page, tr)), - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': self.worker.name, - 'element_image': self.page.iiif_thumbnail_url, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'transcription_id': str(tr.id), - 'transcription_confidence': tr.confidence, - 'transcription_text': tr.text, - 'transcription_worker': None - }, - { - 'id': str(indexer.build_solr_id(self.page, entity)), - 'element_id': str(self.page.id), - 'element_text': self.page.name, - 'element_type': self.page.type.display_name, - 'element_worker': self.worker.name, - 'element_image': self.page.iiif_thumbnail_url, - 'parent_id': str(self.page.id), - 'parent_name': self.page.name, - 'parent_type': self.page.type.display_name, - 'entity_id': str(entity.id), - 'entity_text': entity.name, - 'entity_type': entity.type.value, - 'entity_worker': self.worker.name, - }] - ) - self.assertDictEqual(kwargs, {'commit': True}) diff --git a/arkindex/documents/tests/test_interpreted_date.py b/arkindex/documents/tests/test_interpreted_date.py index df3b490095ea741b1d0dc6f79236b659e4bb4174..8791ecdc49450559b7b73f0cf4001d93a24b3201 100644 --- a/arkindex/documents/tests/test_interpreted_date.py +++ b/arkindex/documents/tests/test_interpreted_date.py @@ -1,7 +1,6 @@ from django.test import TestCase from arkindex.documents.date_parser import parse_date -from arkindex.documents.dates import DateType, InterpretedDate class TestInterpretedDate(TestCase): @@ -10,36 +9,3 @@ class TestInterpretedDate(TestCase): interpreted_date = parse_date('1337-may') self.assertEqual(1, len(interpreted_date)) self.assertEqual('1337-05', str(interpreted_date[0])) - - def test_to_es_range(self): - date_test_table = { - InterpretedDate( - year=1221, - type=DateType.Exact, - ): {'gte': '1221', 'lt': '1221||+1y'}, - InterpretedDate( - year=1350, - month=2, - type=DateType.Exact, - ): {'gte': '1350-02', 'lt': '1350-02||+1M'}, - InterpretedDate( - year=1323, - type=DateType.Lower, - ): {'gte': '1323'}, - InterpretedDate( - year=1212, - month=12, - day=12, - type=DateType.Lower, - ): {'gte': '1212-12-12'}, - InterpretedDate( - year=1212, - type=DateType.Upper, - ): {'lt': '1212||+1y'}, - InterpretedDate( - year=700, - type=DateType.Unknown, - ): {}, - } - for interpreted_date, expected_range in date_test_table.items(): - self.assertEqual(interpreted_date.to_es_range(), expected_range) diff --git a/arkindex/documents/tests/test_manifest.py b/arkindex/documents/tests/test_manifest.py index 44fab622b3e0fe91f7e4a8c6c0746cc4ae548b79..990002a2624ecb14cc58d2ad63329ff0125a6d6c 100644 --- a/arkindex/documents/tests/test_manifest.py +++ b/arkindex/documents/tests/test_manifest.py @@ -97,14 +97,7 @@ class 
TestFolderManifestSerializer(FixtureAPITestCase): {"label": "test 3", "value": "Somewhere"}, ]) - self.assertListEqual(manifest['service'], [ - { - '@context': 'http://iiif.io/api/search/0/context.json', - '@id': f'http://testserver/api/v1/iiif/{self.vol.id}/search/', - 'label': 'Search transcriptions', - 'profile': 'http://iiif.io/api/search/0/search' - } - ]) + self.assertListEqual(manifest['service'], []) def test_no_page(self): # A manifest for an empty volume diff --git a/arkindex/documents/tests/test_search.py b/arkindex/documents/tests/test_search.py deleted file mode 100644 index f997c6ec2196fc9c91ead260fccd947bf5c3a39a..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/test_search.py +++ /dev/null @@ -1,294 +0,0 @@ -import uuid -from unittest.mock import MagicMock - -from django.contrib.auth.models import AnonymousUser -from django.urls import reverse -from elasticsearch_dsl.connections import connections -from rest_framework import status - -from arkindex.dataimport.models import WorkerVersion -from arkindex.documents.models import Corpus, Element, EntityType, MetaType, Transcription -from arkindex.project.elastic import ESTranscription -from arkindex.project.tests import FixtureAPITestCase - - -class TestSearchAPI(FixtureAPITestCase): - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.previous_connection = connections.get_connection('default') - cls.es_mock = MagicMock() - connections.add_connection('default', cls.es_mock) - - @classmethod - def tearDownClass(cls): - super().tearDownClass() - connections.add_connection('default', cls.previous_connection) - - def setUp(self): - self.es_mock.reset_mock() - - def build_es_response(self, hits): - return { - "hits": { - "total": len(hits), - "max_score": None, - "hits": hits - }, - "_shards": { - "total": 5, - "failed": 0, - "skipped": 0, - "successful": 5 - }, - "took": 42, - "timed_out": False - } - - def make_transcription_hit(self, ts): - return { - "_id": str(ts.id.hex), - "_score": None, - "sort": [ - ts.confidence - ], - "_index": "transcriptions", - "_type": "transcription", - "_source": ESTranscription.from_model(ts).to_dict() - } - - def make_nested_transcription_hit(self, ts): - return { - "_source": { - "text": ts.text, - "id": str(ts.id), - "score": ts.confidence, - }, - "_score": ts.confidence, - "_nested": { - "field": "transcriptions", - "offset": 1337 - } - } - - def make_element_hit(self, elt, ts, score=1.0): - return { - "_score": score, - "_type": 'element', - "_id": str(elt.id.hex), - # TODO test date ranges in a query - '_source': {'date_range': []}, - "_index": 'elements', - "inner_hits": { - "transcriptions": { - "hits": { - "total": len(ts), - "hits": list(map(self.make_nested_transcription_hit, ts)), - "max_score": 1337, - } - } - } - } - - def make_entity_hit(self, entity): - return { - "_id": str(entity.id), - "_index": "entities", - "_type": "entity", - } - - def test_element_transcription_search(self): - elt = Element.objects.get(name="Volume 1, page 1r") - ts = Transcription.objects.filter(text="PARIS", element__image__path='img1') - - self.es_mock.count.return_value = {'count': 1} - self.es_mock.search.return_value = self.build_es_response( - [self.make_element_hit(elt, ts), ], - ) - - response = self.client.get(reverse('api:element-search'), {'q': "paris"}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - - results = response.json()["results"] - self.assertEqual(len(results), 1) - result = results[0] - - self.assertEqual(result['id'], str(elt.id)) 
- self.assertCountEqual( - [t['id'] for t in result['transcriptions']], - map(str, ts.values_list('id', flat=True)), - ) - self.assertEqual(result['total_transcriptions'], len(ts)) - - args, kwargs = self.es_mock.search.call_args - self.assertTupleEqual(args, ()) - self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type']) - self.assertListEqual(kwargs['index'], ['elements']) - self.assertListEqual(kwargs['doc_type'], ['element']) - - self.assertCountEqual(kwargs['body'].keys(), ['_source', 'from', 'size', 'query', 'sort']) - self.assertListEqual(kwargs['body']['_source'], ['date_range']) - self.assertEqual(kwargs['body']['from'], 0) - self.assertEqual(kwargs['body']['size'], 1) - - self.assertCountEqual( - kwargs['body']['query']['bool']['filter'][0]['terms']['corpus'], - map(str, Corpus.objects.readable(AnonymousUser()).values_list('id', flat=True)), - ) - - nested = kwargs['body']['query']['bool']['should'][0]['nested'] - self.assertEqual(nested['score_mode'], 'sum') - self.assertEqual(nested['path'], 'transcriptions') - self.assertIn('inner_hits', nested) - - function_score = nested['query']['function_score'] - self.assertListEqual(function_score['functions'], [ - { - "field_value_factor": { - "field": "transcriptions.score", - } - } - ]) - self.assertIsInstance(function_score['query']['bool']['must'], list) - - self.assertTrue(all(len(cond.keys()) == 1 for cond in function_score['query']['bool']['must'])) - conditions = function_score['query']['bool']['must'] - self.assertCountEqual(conditions, [ - {'simple_query_string': { - 'query': 'paris', - 'fields': ['transcriptions.text'], - }}, - {'range': { - 'transcriptions.score': { - 'gte': 0.0, - } - }}, - ]) - - def test_element_reference_search(self): - elt = Element.objects.get(name="Volume 1, page 1r") - ref = elt.metadatas.create( - type=MetaType.Reference, - name='reference', - value='reference1337' - ) - self.es_mock.count.return_value = {'count': 1} - self.es_mock.search.return_value = self.build_es_response( - [self.make_element_hit(elt, []), ], - ) - response = self.client.get(reverse('api:element-search'), {'q': ref.value[2:]}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - - args, kwargs = self.es_mock.search.call_args - - ref_query = kwargs['body']['query']['bool']['should'][1] - self.assertDictEqual(ref_query, { - "wildcard": { - "references": "*{}*".format(ref.value[2:]) - } - }) - - def test_iiif_transcription_search(self): - # Filter to only get transcriptions from volume 1 - unfiltered = Transcription.objects.filter(text="PARIS") - expected = Transcription.objects.filter(text="PARIS", element__image__path__in=['img1', 'img2', 'img3']) - vol = Element.objects.get(name='Volume 1') - - self.es_mock.count.return_value = {'count': len(unfiltered)} - self.es_mock.search.return_value = self.build_es_response( - list(map(self.make_transcription_hit, unfiltered)) - ) - - response = self.client.get(reverse('api:iiif-search', kwargs={'pk': str(vol.id)}), {'q': 'paris'}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - data = response.json() - - self.assertEqual(data['@context'], "http://iiif.io/api/search/0/context.json") - self.assertEqual(data['@type'], 'sc:AnnotationList') - self.assertEqual(data['startIndex'], 0) - self.assertEqual(data['within']['@type'], 'sc:Layer') - self.assertEqual(data['within']['total'], len(expected)) - - hits = data['hits'] - self.assertTrue(all(hit['@type'] == 'search:Hit' for hit in hits)) - self.assertTrue(all(hit['match'] == 'PARIS' for hit in hits)) - - 
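# --- Illustrative sketch, not part of the patch ---
# The ElasticSearch search tests being deleted here are replaced by the
# Solr-backed CorpusSearch tests at the end of this diff. Outside the test
# mocks, an equivalent direct query against a per-corpus collection would look
# roughly like this; the host URL, collection name and query string are
# hypothetical placeholders, while the pagination and facet parameters mirror
# the values asserted in the new test_search.
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')              # assumed Solr endpoint
collection = 'project-00000000-0000-0000-0000-000000000000'  # one collection per corpus

response = solr.query(collection, {
    'q': 'element_text:("I search" OR "Found")',
    'start': 0,
    'rows': 20,
    'facet': True,
    'facet.field': ['element_type', 'element_worker', 'transcription_worker'],
})
# get_facets() is the same accessor the API serializer uses to expose facet counts.
print(response.get_facets())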
resources = data['resources'] - self.assertTrue(all(res['@type'] == 'oa:Annotation' for res in resources)) - self.assertTrue(all(res['motivation'] == 'sc:painting' for res in resources)) - self.assertTrue(all(res['resource']['@type'] == 'cnt:ContentAsText' for res in resources)) - self.assertTrue(all(res['resource']['format'] == 'text/plain' for res in resources)) - self.assertTrue(all(res['resource']['chars'] == 'PARIS' for res in resources)) - - args, kwargs = self.es_mock.search.call_args - self.assertTupleEqual(args, ()) - self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type']) - self.assertListEqual(kwargs['index'], ['transcriptions']) - self.assertListEqual(kwargs['doc_type'], ['transcription']) - - self.assertCountEqual(kwargs['body'].keys(), ['query', 'from', 'size']) - self.assertEqual(kwargs['body']['from'], 0) - self.assertEqual(kwargs['body']['size'], 10000) - self.assertIsInstance(kwargs['body']['query']['bool']['must'], list) - self.assertTrue(all(len(cond.keys()) == 1 for cond in kwargs['body']['query']['bool']['must'])) - conditions = kwargs['body']['query']['bool']['must'] - - self.assertListEqual(conditions, [{'match': {'text': 'paris'}}]) - - def test_entity_search(self): - worker_version = WorkerVersion.objects.first() - entity_1 = self.corpus.entities.create(type=EntityType.Person, name="an entity", worker_version=worker_version) - entity_2 = self.corpus.entities.create(type=EntityType.Location, name="somewhere", worker_version=worker_version) - self.es_mock.count.return_value = {'count': 2} - self.es_mock.search.return_value = self.build_es_response([ - # Test the ES ordering is preserved by returning entities in non-alphabetical order - self.make_entity_hit(entity_2), - self.make_entity_hit(entity_1), - ]) - - response = self.client.get(reverse('api:entity-search'), {'q': 'some query'}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - - results = response.json()["results"] - self.assertEqual(len(results), 2) - self.assertEqual(results[0]['id'], str(entity_2.id)) - self.assertEqual(results[1]['id'], str(entity_1.id)) - - args, kwargs = self.es_mock.search.call_args - self.assertTupleEqual(args, ()) - self.assertCountEqual(kwargs.keys(), ['body', 'index', 'doc_type']) - self.assertListEqual(kwargs['index'], ['entities']) - self.assertListEqual(kwargs['doc_type'], ['entity']) - - self.assertCountEqual(kwargs['body'].keys(), ['from', 'size', 'query']) - self.assertEqual(kwargs['body']['from'], 0) - self.assertEqual(kwargs['body']['size'], 2) - - self.assertIsInstance(kwargs['body']['query']['bool']['must'], list) - self.assertTrue(all(len(cond.keys()) == 1 for cond in kwargs['body']['query']['bool']['must'])) - conditions = kwargs['body']['query']['bool']['must'] - self.assertListEqual(conditions, [ - {'simple_query_string': { - 'query': 'some query', - 'fields': ['name'], - }}, - ]) - - def test_entity_search_does_not_exist(self): - entity_id = uuid.uuid4() - self.assertFalse(self.corpus.entities.filter(id=entity_id).exists()) - - self.es_mock.count.return_value = {'count': 1} - self.es_mock.search.return_value = self.build_es_response([ - { - "_id": str(entity_id), - "_index": "entities", - "_type": "entity", - } - ]) - response = self.client.get(reverse('api:entity-search'), {'q': 'some query'}) - self.assertEqual(response.status_code, status.HTTP_200_OK) - - results = response.json()["results"] - self.assertEqual(len(results), 0) diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py index 
b1fc712bb2456aa021b56a404e6c87aa564375b6..55a55717079f3c21aa3d114010e2f44750364805 100644 --- a/arkindex/documents/tests/test_search_api.py +++ b/arkindex/documents/tests/test_search_api.py @@ -1,126 +1,263 @@ from unittest.mock import patch -from django.contrib.auth.models import AnonymousUser from django.test import override_settings from django.urls import reverse from rest_framework import status +from SolrClient import SolrResponse +from SolrClient.exceptions import SolrError -from arkindex.documents.models import Corpus, Element +from arkindex.documents.models import Corpus from arkindex.project.tests import FixtureAPITestCase +@override_settings(ARKINDEX_FEATURES={'search': True}) class TestSearchApi(FixtureAPITestCase): @classmethod def setUpTestData(cls): super().setUpTestData() - cls.private_corpus = Corpus.objects.create(name='private', public=False) - - def setUp(self): - super().setUp() - self.valid_params = ( - {'q': 'a', 'confidence': '.7'}, - {'q': 'one two', 'date_lte': '1333'}, - {'q': 'one two', 'date_gte': '1333-12'}, - {'q': 'one two', 'date_gte': '1333-12-02'}, - {'q': 'one', 'corpus': str(self.corpus.id), 'element_type': 'page'}, - {'q': 'cat', 'corpus': str(self.corpus.id), 'confidence': '0.9', 'date_lte': '1333-12-02'}, - # Search by date only - {'date_gte': '1280', 'date_lte': '1290'}, - ) - self.wrong_params = ( - {'score': '0.7'}, - {'q': ' ', 'confidence': '0.7'}, - {'q': 'one', 'confidence': '1.01'}, - {'q': 'that', 'confidence': 'null'}, - {'q': 'that', 'confidence': 'nan'}, - {'q': 'that', 'confidence': 'inf'}, - {'q': 'one two', 'date_lte': '1450-'}, - {'q': 'one two', 'date_lte': '1450-02-30'}, - {'q': 'cat', 'corpus': 'not_even_an_uuid'}, - {'q': 'one', 'element_type': 'page'}, - {'q': 'two', 'date_gte': '1460', 'date_lte': '1450'}, - {'q': 'long query' * 200}, - ) - self.forbidden_params = ( - {'q': 'knowledge', 'corpus': self.private_corpus.id}, - ) - - @patch('arkindex.project.mixins.ESQuerySet') - def test_search_api(self, esqs_mock): - """ - Check if different set of client-provided parameters are - correctly handled by search api endpoint - """ - for params in self.valid_params: - response = self.client.get(reverse('api:element-search'), params) - self.assertEqual(response.status_code, status.HTTP_200_OK, response.json()) - - for params in self.wrong_params: - response = self.client.get(reverse('api:element-search'), params) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - - for params in self.forbidden_params: - response = self.client.get(reverse('api:element-search'), params) - self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - - @patch('arkindex.project.mixins.ESQuerySet') - def test_element_search_no_corpora(self, esqs_mock): - """ - Test search as a user without access to any corpora returns nothing without asking ES - """ - self.corpus.public = False + cls.corpus.indexable = True + cls.corpus.save() + + def build_solr_response( + self, + docs=[], + query='', + facets={ + 'element_type': [], + 'element_worker': [], + 'transcription_worker': [], + 'classification_name': [], + 'classification_worker': [], + 'metadata_name': [], + 'metadata_type': [], + 'metadata_worker': [], + 'entity_type': [], + 'entity_worker': [] + }, + nb_results=None + ): + return SolrResponse({ + 'responseHeader': { + 'QTime': 1, + 'params': { + 'q': query + } + }, + 'response': { + 'numFound': nb_results if nb_results else len(docs), + 'start': 0, + 'numFoundExact': True, + 'docs': docs + }, + 'facet_counts': { + 'facet_fields': 
facets, + } + }) + + @override_settings(ARKINDEX_FEATURES={'search': False}) + def test_search_unavailable(self): + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.json(), ['Search features are not available on this instance.']) + + def test_corpus_not_found(self): + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'})) + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + self.assertEqual(response.json(), {'detail': 'Not found.'}) + + def test_corpus_no_permission(self): + private_corpus = Corpus.objects.create(name='private', public=False) + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': private_corpus.id})) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_corpus_not_indexable(self): + self.corpus.indexable = False self.corpus.save() - self.assertFalse(Corpus.objects.readable(AnonymousUser()).exists()) - response = self.client.get(reverse('api:element-search'), {'q': 'abc'}) + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.json(), [f'Corpus {self.corpus.id} is not indexable.']) + + @patch('arkindex.documents.api.search.solr') + def test_index_not_found(self, mock_solr): + mock_solr.collections.exists.return_value = False + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + self.assertEqual(response.json(), {'detail': f'Corpus index project-{self.corpus.id} not found.'}) + + @patch('arkindex.documents.api.search.solr') + def test_search_empty_query(self, mock_solr): + mock_solr.collections.exists.return_value = True + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.json(), {'query': ['This field is required.']}) + + @patch('arkindex.documents.api.search.solr') + def test_search_wrong_query(self, mock_solr): + # Mock SolrClient + mock_solr.collections.exists.return_value = True + mock_solr.query.side_effect = SolrError() + + payload = { + 'sources[]': ['element'], + 'query': '*)' + } + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.json(), ['Query "*)" is not valid.']) + + @patch('arkindex.documents.api.search.solr') + def test_search(self, mock_solr): + collection_name = f'project-{self.corpus.id}' + possible_queries = [ + '(element_text:("I search" OR "Found") OR transcription_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")', + '(transcription_text:("I search" OR "Found") OR element_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")' + ] + docs = [{ + 'id': 'document_id', + 'parent_id': 'parent_id', + 'parent_name': '1', + 'parent_type': 'Page', + 'element_id': 'element_id', + 'element_text': '3', + 'element_type': 'Paragraph', + 'element_worker': '1234567890_A worker', + 'element_image': 'http://image.url', + 'transcription_id': 'transcription_id', + 'transcription_text': 'A text', + 'transcription_confidence': 0.2, + 
'transcription_worker': '1234567890_A worker', + 'classification_id': 'classification_id', + 'classification_name': 'my class', + 'classification_confidence': 0.1, + 'classification_worker': '1234567890_A worker', + 'metadata_id': 'metadata_id', + 'metadata_name': 'A metadata', + 'metadata_text': 'My value', + 'metadata_type': 'text', + 'metadata_worker': '1234567890_A worker', + 'entity_id': 'entity_id', + 'entity_text': 'An entity', + 'entity_type': 'person', + 'entity_worker': '1234567890_A worker', + }] + + # Mock SolrClient + mock_solr.collections.exists.return_value = True + solr_response = self.build_solr_response(docs=docs, query=possible_queries[0]) + mock_solr.query.return_value = solr_response + + payload = { + 'sources[]': ['element', 'transcription'], + 'metadata_name': 'folio', + 'entity_type': 'person', + 'query': '"I search" OR "Found"', + } + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertDictEqual(response.json(), { - 'count': 0, + 'count': 1, + 'number': 1, 'next': None, 'previous': None, - 'number': 1, - 'results': [] + 'results': docs, + 'facets': solr_response.get_facets() }) - @patch('arkindex.project.mixins.ESQuerySet') - def test_entity_search_no_corpora(self, esqs_mock): - """ - Test search as a user without access to any corpora returns nothing without asking ES - """ - self.corpus.public = False - self.corpus.save() - self.assertFalse(Corpus.objects.readable(AnonymousUser()).exists()) - response = self.client.get(reverse('api:entity-search'), {'q': 'abc'}) + # Check Solr call + (index_name, ), _ = mock_solr.collections.exists.call_args + self.assertEqual(index_name, collection_name) + (index_name, args), kwargs = mock_solr.query.call_args + self.assertEqual(index_name, collection_name) + self.assertIn(args.pop('q'), possible_queries) + self.assertDictEqual(args, { + 'start': 0, + 'rows': 20, + 'facet': True, + 'facet.field': [ + 'element_type', + 'element_worker', + 'transcription_worker', + 'classification_name', + 'classification_worker', + 'metadata_name', + 'metadata_type', + 'metadata_worker', + 'entity_type', + 'entity_worker' + ] + }) + + @patch('arkindex.documents.api.search.solr') + def test_search_only_facets(self, mock_solr): + docs = [{ + 'id': 'doc_id', + 'parent_id': 'parent_id', + 'parent_name': '1', + 'parent_type': 'Page', + 'element_id': 'element_id', + 'element_text': '3', + 'element_type': 'Paragraph' + }] + facets = { + 'element_type': [ + 'Paragraph', 1, + 'Page', 0, + 'Text line', 0 + ], + 'element_worker': [], + 'transcription_worker': [], + 'classification_name': [], + 'classification_worker': [], + 'metadata_name': [], + 'metadata_type': [], + 'metadata_worker': [], + 'entity_type': [], + 'entity_worker': [] + } + + # Mock SolrClient + mock_solr.collections.exists.return_value = True + solr_response = self.build_solr_response(docs=docs, facets=facets) + mock_solr.query.return_value = solr_response + + payload = { + 'query': 'Test', + 'only_facets': True + } + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertDictEqual(response.json(), { 'count': 0, + 'number': 1, 'next': None, 'previous': None, - 'number': 1, - 'results': [] + 'results': None, + 'facets': solr_response.get_facets() }) - @override_settings(ARKINDEX_FEATURES={'search': False}) - @patch('arkindex.project.mixins.ESQuerySet') - def 
test_element_search_no_search(self, esqs_mock): - response = self.client.get(reverse('api:element-search'), {'q': 'abc'}) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertListEqual(response.json(), ['Search features are not available on this instance.']) - self.assertFalse(esqs_mock.called) + @patch('arkindex.documents.api.search.solr') + def test_search_pagination(self, mock_solr): + # Mock SolrClient + mock_solr.collections.exists.return_value = True + solr_response = self.build_solr_response(nb_results=42) + mock_solr.query.return_value = solr_response - @override_settings(ARKINDEX_FEATURES={'search': False}) - @patch('arkindex.project.mixins.ESQuerySet') - def test_entity_search_no_search(self, esqs_mock): - response = self.client.get(reverse('api:entity-search'), {'q': 'abc'}) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertListEqual(response.json(), ['Search features are not available on this instance.']) - self.assertFalse(esqs_mock.called) + payload = { + 'query': 'Test', + 'page': 2 + } + response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertDictEqual(response.json(), { + 'count': 42, + 'number': 2, + 'next': f'http://testserver/api/v1/corpus/{self.corpus.id}/search/?page=3&query=Test', + 'previous': f'http://testserver/api/v1/corpus/{self.corpus.id}/search/?page=1&query=Test', + 'results': [], + 'facets': solr_response.get_facets() + }) - @override_settings(ARKINDEX_FEATURES={'search': False}) - @patch('arkindex.project.mixins.ESQuerySet') - def test_iiif_search_no_search(self, esqs_mock): - element = Element.objects.get(name='Volume 1') - response = self.client.get(reverse('api:iiif-search', kwargs={'pk': str(element.id)}), {'q': 'abc'}) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertListEqual(response.json(), ['Search features are not available on this instance.']) - self.assertFalse(esqs_mock.called) + # Check Solr call + (_, args), _ = mock_solr.query.call_args + self.assertEqual(args['start'], 20) diff --git a/arkindex/documents/tests/test_search_api_v2.py b/arkindex/documents/tests/test_search_api_v2.py deleted file mode 100644 index d068ad64b1a79616f78a5d0f1bb422df4e8ad347..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/test_search_api_v2.py +++ /dev/null @@ -1,263 +0,0 @@ -from unittest.mock import patch - -from django.test import override_settings -from django.urls import reverse -from rest_framework import status -from SolrClient import SolrResponse -from SolrClient.exceptions import SolrError - -from arkindex.documents.models import Corpus -from arkindex.project.tests import FixtureAPITestCase - - -@override_settings(ARKINDEX_FEATURES={'search_v2': True}) -class TestSearchApi(FixtureAPITestCase): - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - cls.corpus.indexable = True - cls.corpus.save() - - def build_solr_response( - self, - docs=[], - query='', - facets={ - 'element_type': [], - 'element_worker': [], - 'transcription_worker': [], - 'classification_name': [], - 'classification_worker': [], - 'metadata_name': [], - 'metadata_type': [], - 'metadata_worker': [], - 'entity_type': [], - 'entity_worker': [] - }, - nb_results=None - ): - return SolrResponse({ - 'responseHeader': { - 'QTime': 1, - 'params': { - 'q': query - } - }, - 'response': { - 'numFound': nb_results if nb_results else len(docs), - 
'start': 0, - 'numFoundExact': True, - 'docs': docs - }, - 'facet_counts': { - 'facet_fields': facets, - } - }) - - @override_settings(ARKINDEX_FEATURES={'search_v2': False}) - def test_search_v2_unavailable(self): - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertEqual(response.json(), ['Search features are not available on this instance.']) - - def test_corpus_not_found(self): - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'})) - self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - self.assertEqual(response.json(), {'detail': 'Not found.'}) - - def test_corpus_no_permission(self): - private_corpus = Corpus.objects.create(name='private', public=False) - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': private_corpus.id})) - self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - - def test_corpus_not_indexable(self): - self.corpus.indexable = False - self.corpus.save() - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertEqual(response.json(), [f'Corpus {self.corpus.id} is not indexable.']) - - @patch('arkindex.documents.api.search.solr') - def test_index_not_found(self, mock_solr): - mock_solr.collections.exists.return_value = False - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) - self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) - self.assertEqual(response.json(), {'detail': f'Corpus index project-{self.corpus.id} not found.'}) - - @patch('arkindex.documents.api.search.solr') - def test_search_empty_query(self, mock_solr): - mock_solr.collections.exists.return_value = True - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id})) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertEqual(response.json(), {'query': ['This field is required.']}) - - @patch('arkindex.documents.api.search.solr') - def test_search_wrong_query(self, mock_solr): - # Mock SolrClient - mock_solr.collections.exists.return_value = True - mock_solr.query.side_effect = SolrError() - - payload = { - 'sources[]': ['element'], - 'query': '*)' - } - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertEqual(response.json(), ['Query "*)" is not valid.']) - - @patch('arkindex.documents.api.search.solr') - def test_search(self, mock_solr): - collection_name = f'project-{self.corpus.id}' - possible_queries = [ - '(element_text:("I search" OR "Found") OR transcription_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")', - '(transcription_text:("I search" OR "Found") OR element_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")' - ] - docs = [{ - 'id': 'document_id', - 'parent_id': 'parent_id', - 'parent_name': '1', - 'parent_type': 'Page', - 'element_id': 'element_id', - 'element_text': '3', - 'element_type': 'Paragraph', - 'element_worker': '1234567890_A worker', - 'element_image': 'http://image.url', - 'transcription_id': 'transcription_id', - 'transcription_text': 'A text', - 'transcription_confidence': 0.2, - 'transcription_worker': '1234567890_A worker', - 
'classification_id': 'classification_id', - 'classification_name': 'my class', - 'classification_confidence': 0.1, - 'classification_worker': '1234567890_A worker', - 'metadata_id': 'metadata_id', - 'metadata_name': 'A metadata', - 'metadata_text': 'My value', - 'metadata_type': 'text', - 'metadata_worker': '1234567890_A worker', - 'entity_id': 'entity_id', - 'entity_text': 'An entity', - 'entity_type': 'person', - 'entity_worker': '1234567890_A worker', - }] - - # Mock SolrClient - mock_solr.collections.exists.return_value = True - solr_response = self.build_solr_response(docs=docs, query=possible_queries[0]) - mock_solr.query.return_value = solr_response - - payload = { - 'sources[]': ['element', 'transcription'], - 'metadata_name': 'folio', - 'entity_type': 'person', - 'query': '"I search" OR "Found"', - } - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertDictEqual(response.json(), { - 'count': 1, - 'number': 1, - 'next': None, - 'previous': None, - 'results': docs, - 'facets': solr_response.get_facets() - }) - - # Check Solr call - (index_name, ), _ = mock_solr.collections.exists.call_args - self.assertEqual(index_name, collection_name) - (index_name, args), kwargs = mock_solr.query.call_args - self.assertEqual(index_name, collection_name) - self.assertIn(args.pop('q'), possible_queries) - self.assertDictEqual(args, { - 'start': 0, - 'rows': 20, - 'facet': True, - 'facet.field': [ - 'element_type', - 'element_worker', - 'transcription_worker', - 'classification_name', - 'classification_worker', - 'metadata_name', - 'metadata_type', - 'metadata_worker', - 'entity_type', - 'entity_worker' - ] - }) - - @patch('arkindex.documents.api.search.solr') - def test_search_only_facets(self, mock_solr): - docs = [{ - 'id': 'doc_id', - 'parent_id': 'parent_id', - 'parent_name': '1', - 'parent_type': 'Page', - 'element_id': 'element_id', - 'element_text': '3', - 'element_type': 'Paragraph' - }] - facets = { - 'element_type': [ - 'Paragraph', 1, - 'Page', 0, - 'Text line', 0 - ], - 'element_worker': [], - 'transcription_worker': [], - 'classification_name': [], - 'classification_worker': [], - 'metadata_name': [], - 'metadata_type': [], - 'metadata_worker': [], - 'entity_type': [], - 'entity_worker': [] - } - - # Mock SolrClient - mock_solr.collections.exists.return_value = True - solr_response = self.build_solr_response(docs=docs, facets=facets) - mock_solr.query.return_value = solr_response - - payload = { - 'query': 'Test', - 'only_facets': True - } - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertDictEqual(response.json(), { - 'count': 0, - 'number': 1, - 'next': None, - 'previous': None, - 'results': None, - 'facets': solr_response.get_facets() - }) - - @patch('arkindex.documents.api.search.solr') - def test_search_pagination(self, mock_solr): - # Mock SolrClient - mock_solr.collections.exists.return_value = True - solr_response = self.build_solr_response(nb_results=42) - mock_solr.query.return_value = solr_response - - payload = { - 'query': 'Test', - 'page': 2 - } - response = self.client.get(reverse('api:corpus-search', kwargs={'pk': self.corpus.id}), payload) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertDictEqual(response.json(), { - 'count': 42, - 'number': 2, - 'next': 
f'http://testserver/api/v1/corpus/{self.corpus.id}/search/?page=3&query=Test', - 'previous': f'http://testserver/api/v1/corpus/{self.corpus.id}/search/?page=1&query=Test', - 'results': [], - 'facets': solr_response.get_facets() - }) - - # Check Solr call - (_, args), _ = mock_solr.query.call_args - self.assertEqual(args['start'], 20) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index 3ad1408040927c630ab8eb83b9de1910f62d57ee..33886572e1d9505f1d5d3ae76bf5b45c17d3c746 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -65,7 +65,7 @@ from arkindex.documents.api.entities import ( TranscriptionEntityCreate, ) from arkindex.documents.api.export import CorpusExportAPIView, DownloadExport -from arkindex.documents.api.iiif import ElementAnnotationList, FolderManifest, TranscriptionSearchAnnotationList +from arkindex.documents.api.iiif import ElementAnnotationList, FolderManifest from arkindex.documents.api.ml import ( ClassificationBulk, ClassificationCreate, @@ -79,7 +79,7 @@ from arkindex.documents.api.ml import ( TranscriptionCreate, TranscriptionEdit, ) -from arkindex.documents.api.search import CorpusSearch, ElementSearch, EntitySearch +from arkindex.documents.api.search import CorpusSearch from arkindex.images.api import IIIFInformationCreate, IIIFURLCreate, ImageCreate, ImageElements, ImageRetrieve from arkindex.project.openapi import OpenApiSchemaView from arkindex.users.api import ( @@ -158,7 +158,6 @@ api = [ # Manifests path('iiif/<uuid:pk>/manifest/', FolderManifest.as_view(), name='folder-manifest'), path('iiif/<uuid:pk>/list/transcriptions/', ElementAnnotationList.as_view(), name='element-annotation-list'), - path('iiif/<uuid:pk>/search/', TranscriptionSearchAnnotationList.as_view(), name='iiif-search'), # Placeholder URLs for IIIF IDs path( @@ -177,10 +176,6 @@ api = [ name='transcription-annotation', ), - # Search engines - path('elements/search/', ElementSearch.as_view(), name='element-search'), - path('entity/search/', EntitySearch.as_view(), name='entity-search'), - # Ingest transcriptions path('transcription/<uuid:pk>/', TranscriptionEdit.as_view(), name='transcription-edit'), path('transcription/bulk/', TranscriptionBulk.as_view(), name='transcription-bulk'), diff --git a/arkindex/project/config.py b/arkindex/project/config.py index 71613fbcd0814c8448286e82487c41af5676b255..3b71ae0efc104c40972cb8da7f47654ef357376b 100644 --- a/arkindex/project/config.py +++ b/arkindex/project/config.py @@ -85,9 +85,6 @@ def get_settings_parser(base_dir): static_parser.add_option('universal_viewer_url', type=str, default=None) static_parser.add_option('frontend_version', type=str, default=None) - elasticsearch_parser = parser.add_subparser('elasticsearch', default={}) - elasticsearch_parser.add_option('hosts', type=str, many=True, default=['localhost']) - solr_parser = parser.add_subparser('solr', default={}) solr_parser.add_option('api_url', type=str, default='http://localhost:8983/solr/') @@ -159,8 +156,7 @@ def get_settings_parser(base_dir): features_parser = parser.add_subparser('features', allow_extra_keys=False, default={}) features_parser.add_option('signup', type=bool, default=True) features_parser.add_option('selection', type=bool, default=True) - features_parser.add_option('search', type=bool, default=True) - features_parser.add_option('search_v2', type=bool, default=False) + features_parser.add_option('search', type=bool, default=False) features_parser.add_option('transkribus', type=bool, default=True) 
features_parser.add_option('workers', type=bool, default=False) diff --git a/arkindex/project/elastic.py b/arkindex/project/elastic.py deleted file mode 100644 index 942d2381625442de4a6a2a26d9f5f570850a3a27..0000000000000000000000000000000000000000 --- a/arkindex/project/elastic.py +++ /dev/null @@ -1,159 +0,0 @@ -from collections.abc import Sequence -from itertools import chain - -from django.conf import settings -from elasticsearch_dsl import Date, DateRange, Document, Float, InnerDoc, Keyword, Mapping, Nested, Search, Text -from elasticsearch_dsl.connections import connections - -connections.create_connection(hosts=settings.ELASTIC_SEARCH_HOSTS) - - -class ESQuerySet(Sequence): - - def __init__(self, func, search): - assert isinstance(search, Search) - self.func = func - self.search = search - - def __iter__(self): - # When requesting to iterate over the results without explicitly setting - # a size, ES will only return a few results, messing up the IIIF search API. - return iter(self.func(self.search[0:settings.ES_RESULTS_LIMIT].execute())) - - def __getitem__(self, value): - return self.func(self.search[value].execute()) - - def __len__(self): - return self.search.count() - - -class RawDate(Date): - """ - A Date field that just returns strings. Useful to use the ES date math - https://www.elastic.co/guide/en/elasticsearch/reference/6.2/common-options.html#date-math - """ - # TODO: Handle InterpretedDates? - - def _deserialize(self, data): - if isinstance(data, str): - if '||' in data: - data = data.partition('||')[0] - return data - return super()._deserialize(data) - - -class RawDateRange(DateRange): - _core_field = RawDate() - - -class ESTranscription(Document): - element = Keyword() - corpus = Keyword() - score = Float() - text = Text() - - class Meta: - mapping = Mapping('transcription') - - class Index: - name = 'transcriptions' - - @classmethod - def from_model(cls, instance): - return cls( - meta={'id': instance.id.hex}, - element=instance.element_id, - score=instance.confidence, - text=instance.text, - corpus=instance.element.corpus_id, - ) - - -class ESTranscriptionInnerDoc(InnerDoc): - """ - A transcription nested inside an element document. 
- """ - id = Keyword() - score = Float() - text = Text() - - @classmethod - def from_model(cls, instance): - return cls( - id=instance.id, - score=instance.confidence, - text=instance.text, - ) - - -class ESElement(Document): - type = Keyword() - corpus = Keyword() - # Used exclusively for sorting - parents = Keyword() - references = Keyword() - transcriptions = Nested(ESTranscriptionInnerDoc) - date_range = RawDateRange(format='yyyy||yyyy-MM||yyyy-MM-dd') - - class Meta: - mapping = Mapping('element') - - class Index: - name = 'elements' - - @classmethod - def from_model(cls, instance): - from arkindex.documents.models import Element, MetaType - - interpreted_dates = chain(*[md.get_dates() for md in instance.metadatas.all()]) - date_range = { - k: val - for date in interpreted_dates - for k, val in date.to_es_range().items() - } - if not date_range: - # Prevent inserting an infinite date range (an empty dict) - date_range = None - - return cls( - meta={'id': instance.id.hex}, - corpus=instance.corpus_id, - type=instance.type.slug, - parents=[ - element.name - for element in Element.objects.get_ascending(instance.id) - ], - # Filter using Python here as metadatas are prefetched entirely - # and applying .filter would make an unnecessary SQL query - references=[ - md.value.lower() - for md in instance.metadatas.all() - if md.type == MetaType.Reference - ], - transcriptions=list(map( - ESTranscriptionInnerDoc.from_model, - instance.transcriptions.all(), - )), - date_range=date_range, - ) - - -class ESEntity(Document): - corpus = Keyword() - type = Keyword() - name = Text() - - class Meta: - mapping = Mapping('entity') - - class Index: - name = 'entities' - - @classmethod - def from_model(cls, instance): - return cls( - meta={'id': instance.id.hex}, - corpus=instance.corpus_id, - type=instance.type.value, - name=instance.name, - ) diff --git a/arkindex/project/mixins.py b/arkindex/project/mixins.py index 560e4633ef9aff1401e941d8cff86334bd868a06..3699d8e3af653637d4d5acd6c7f41ec985486658 100644 --- a/arkindex/project/mixins.py +++ b/arkindex/project/mixins.py @@ -4,13 +4,11 @@ from django.shortcuts import get_object_or_404 from django.views.decorators.cache import cache_page from drf_spectacular.utils import extend_schema, extend_schema_view from rest_framework import status -from rest_framework.exceptions import APIException, PermissionDenied, ValidationError +from rest_framework.exceptions import APIException, PermissionDenied from rest_framework.serializers import CharField, Serializer from arkindex.dataimport.models import DataImport, DataImportMode, Repository, Worker from arkindex.documents.models import Corpus -from arkindex.documents.serializers.search import SearchQuerySerializer -from arkindex.project.elastic import ESQuerySet from arkindex.project.pagination import CustomCursorPagination from arkindex.users.models import Role from arkindex.users.utils import check_level_param, filter_rights, get_max_level @@ -189,46 +187,6 @@ class ProcessACLMixin(ACLMixin): return max(filter(None, access_levels), default=None) -class SearchAPIMixin(CorpusACLMixin): - query_serializer_class = SearchQuerySerializer - search = None - - def get_search(self, **query): - return self.search - - def get_queryset(self): - if not settings.ARKINDEX_FEATURES['search']: - raise ValidationError(['Search features are not available on this instance.']) - - serializer = self.query_serializer_class( - context={'request': self.request}, - data=self.request.query_params, - ) - serializer.is_valid(raise_exception=True) 
- query = serializer.validated_data - # TODO Handle corpus field in serializer too - if query.get('corpus_id'): - try: - query['corpora_ids'] = [str(self.get_corpus(query['corpus_id']).id), ] - except Corpus.DoesNotExist: - raise PermissionDenied - else: - query['corpora_ids'] = list(map( - str, - Corpus.objects.readable(self.request.user).values_list('id', flat=True), - )) - - if not query['corpora_ids']: - return [] - - query.pop('corpus_id', None) - search = self.get_search(**query) - return ESQuerySet(self.post_process, search) - - def post_process(self, *args): - return args - - class SelectionMixin(object): def get_selection(self, corpus_id=None): diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index 519b158c7b3a105a8f91faa3525bc6aae7fdbe4f..6373963ad6039bf708fab1b44f4fb82f42127d06 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -288,15 +288,6 @@ SOLR_PAGINATION_SIZE = 20 SEARCH_FILTER_MAX_TERMS = 10 -# Elastic search config -ELASTIC_SEARCH_HOSTS = conf['elasticsearch']['hosts'] -# The Scroll API is required to go over 10K results -ES_RESULTS_LIMIT = 10000 -# ES defaults to three items returned in a nested query if the inner_hits size is not defined -ES_INNER_RESULTS_LIMIT = 6 -# Maximum length for query strings—very long queries can cause timeouts -ES_QUERY_STRING_MAX_LENGTH = 1000 - # InfluxDB API root INFLUXDB_API_URL = conf['influxdb']['api_url'] @@ -306,9 +297,7 @@ SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') # IIIF manifests IIIF_PRESENTATION_CONTEXT = "http://iiif.io/api/presentation/2/context.json" IIIF_IMAGE_CONTEXT = "http://iiif.io/api/image/2/context.json" -IIIF_SEARCH_CONTEXT = "http://iiif.io/api/search/0/context.json" IIIF_IMAGE_SERVICE_PROFILE = "http://iiif.io/api/image/2/level2.json" -IIIF_SEARCH_SERVICE_PROFILE = "http://iiif.io/api/search/0/search" # IIIF manifest download timeout # See http://docs.python-requests.org/en/master/user/advanced/#timeouts @@ -424,14 +413,6 @@ LOGGING = { 'handlers': ['console'], 'level': 'INFO', }, - 'elasticsearch': { - 'level': 'WARNING', - }, - 'elasticsearch.trace': { - 'handlers': ['console_debug'], - 'level': 'DEBUG', - 'propagate': False, - }, }, 'formatters': { 'verbose': { diff --git a/arkindex/project/tests/config_samples/defaults.yaml b/arkindex/project/tests/config_samples/defaults.yaml index ea5f88fc55e3e89c00bcc5bbf3b8772fef00e8e7..b26a2c5dd89a797c90d4fa10109da6621d0db66f 100644 --- a/arkindex/project/tests/config_samples/defaults.yaml +++ b/arkindex/project/tests/config_samples/defaults.yaml @@ -27,13 +27,9 @@ docker: doorbell: appkey: null id: null -elasticsearch: - hosts: - - localhost email: null features: - search: true - search_v2: false + search: false selection: true signup: true transkribus: true diff --git a/arkindex/project/tests/config_samples/errors.yaml b/arkindex/project/tests/config_samples/errors.yaml index d8877b7914c8b41eeeeb094fd5ae4bd67dea3b98..8e3e20657f839aa44ac5b6eea555d9709da214fc 100644 --- a/arkindex/project/tests/config_samples/errors.yaml +++ b/arkindex/project/tests/config_samples/errors.yaml @@ -18,8 +18,6 @@ database: docker: tasks_image: here: have a dict -elasticsearch: - hosts: ghosts email: host: 123 features: diff --git a/arkindex/project/tests/config_samples/override.yaml b/arkindex/project/tests/config_samples/override.yaml index be46d7c9bf4b5980c735934994f629e6c1ff9730..58361ede7da3fccc56b2b02159ff49ace4237849 100644 --- a/arkindex/project/tests/config_samples/override.yaml +++ 
b/arkindex/project/tests/config_samples/override.yaml @@ -35,9 +35,6 @@ docker: doorbell: appkey: doorbellappkey id: '123456' -elasticsearch: - hosts: - - google email: error_report_recipients: - noreply@nasa.gov @@ -46,8 +43,7 @@ email: port: 25 user: teklia@wanadoo.fr features: - search: false - search_v2: true + search: true selection: false signup: false transkribus: false diff --git a/arkindex/project/tests/test_elastic.py b/arkindex/project/tests/test_elastic.py deleted file mode 100644 index 1246661ad3739ea5905e56ac04c7fb494e955eba..0000000000000000000000000000000000000000 --- a/arkindex/project/tests/test_elastic.py +++ /dev/null @@ -1,65 +0,0 @@ -from unittest.mock import patch - -from arkindex.dataimport.models import WorkerVersion -from arkindex.documents.dates import DateType, InterpretedDate -from arkindex.documents.models import MetaType -from arkindex.project.elastic import ESElement -from arkindex.project.tests import FixtureAPITestCase - - -class TestESDocuments(FixtureAPITestCase): - - @patch('arkindex.documents.models.MetaData.get_dates') - def test_build_search_index_dates(self, get_dates_mock): - element = self.corpus.elements.create( - type=self.corpus.types.get(slug='act'), - name='sister act', - ) - element.metadatas.create(type=MetaType.Date, name='date', value='something') - - # Test Upper bound - get_dates_mock.return_value = [InterpretedDate(1420, 5, type=DateType.Upper), ] - date_range = ESElement.from_model(element).to_dict().get('date_range') - self.assertDictEqual(date_range, {'lt': '1420-05||+1M'}) - - # Test Lower bound - get_dates_mock.return_value = [InterpretedDate(1418, type=DateType.Lower), ] - date_range = ESElement.from_model(element).to_dict().get('date_range') - self.assertDictEqual(date_range, {'gte': '1418'}) - - # Test with both Lower and Upper bound - get_dates_mock.return_value = [ - InterpretedDate(1418, type=DateType.Lower), - InterpretedDate(1428, type=DateType.Upper), - ] - date_range = ESElement.from_model(element).to_dict().get('date_range') - self.assertDictEqual(date_range, {'gte': '1418', 'lt': '1428||+1y'}) - - # Test an exact date - get_dates_mock.return_value = [InterpretedDate(1666, 2, 3), ] - date_range = ESElement.from_model(element).to_dict().get('date_range') - self.assertDictEqual(date_range, {'gte': '1666-02-03', 'lt': '1666-02-03||+1d'}) - - def test_index_reference_metadata(self): - element = self.corpus.elements.get(name='Volume 1') - element.metadatas.create(type=MetaType.Reference, name='ref.', value='123ABC') - es_document = ESElement.from_model(element) - self.assertCountEqual(es_document.references, ['123abc']) - - def test_children_no_polygon(self): - """ - Ensure elements without images and polygons are ignored when indexing an element's children - """ - page = self.corpus.elements.get(name='Volume 1, page 1r') - self.assertTrue(page.transcriptions.exists()) - surface = self.corpus.elements.create( - type=self.corpus.types.get(slug='surface'), - name='/dev/null', - ) - surface.add_parent(page) - surface.transcriptions.create( - text='invisible transcription', - worker_version=WorkerVersion.objects.get(worker__slug='reco'), - ) - texts = [tr['text'] for tr in ESElement.from_model(page).to_dict()['transcriptions']] - self.assertNotIn('invisible transcription', texts) diff --git a/arkindex/project/tools.py b/arkindex/project/tools.py index 93f433d14fc78a29348c2d46f1cacc72f1c50f3a..319da45bd6c3320db46f2a8e70ccf962ad7725e4 100644 --- a/arkindex/project/tools.py +++ b/arkindex/project/tools.py @@ -14,22 +14,6 @@ def 
build_absolute_url(element, request, name, id_argument='pk', **kwargs): return request.build_absolute_uri(reverse(name, kwargs=kwargs)) -class disconnect_signal(): - """ - Context manager to temporarily disconnect a signal - """ - - def __init__(self, signal, **kwargs): - self.signal = signal - self.kwargs = kwargs - - def __enter__(self): - self.signal.disconnect(**self.kwargs) - - def __exit__(self, *args): - self.signal.connect(**self.kwargs) - - def build_tree(tree, *, corpus, type): """ Build Element and ElementPath instances from a tree described by diff --git a/arkindex/sql_validation/indexer_prefetch_v2.sql b/arkindex/sql_validation/indexer_prefetch.sql similarity index 100% rename from arkindex/sql_validation/indexer_prefetch_v2.sql rename to arkindex/sql_validation/indexer_prefetch.sql diff --git a/base/requirements.txt b/base/requirements.txt index 7db22b532aef4aeb8673fb5dc7ba255f1c9b2022..66af49cff485c3ab2a56be1c8e2a436b752f7454 100644 --- a/base/requirements.txt +++ b/base/requirements.txt @@ -1,6 +1,5 @@ boto3==1.18.13 cryptography==3.4.7 Django==3.2.6 -elasticsearch==6.8.1 lxml==4.6.3 psycopg2-binary==2.9.1 diff --git a/requirements.txt b/requirements.txt index 44cf2f53b9e7187a8be547f9b92fc5ee9bd89b35..7f283e87645d92599739456c5a5471a7526abca2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ django-redis==5.0.0 django-rq==2.4.1 djangorestframework==3.12.4 drf-spectacular==0.17.3 -elasticsearch-dsl>=6.0.0,<7.0.0 gitpython==3.1.20 python-gitlab==2.10.0 python-memcached==1.59
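Note on the new search flow: the hunks above drop the ElasticSearch-backed endpoints and leave a single per-corpus Solr search API, exercised by the tests in test_search_api.py. As a rough illustration of the behaviour those tests assert, here is a minimal Python sketch of how a view could build the Solr query and paginate results. This is not the actual arkindex.documents.api.search code; the helper names (build_solr_query, corpus_search) are hypothetical, SOLR_PAGINATION_SIZE mirrors the settings value visible above, and the solr client is assumed to expose the collections.exists(), query() and get_facets() calls that the tests mock.

from SolrClient import SolrClient

SOLR_PAGINATION_SIZE = 20  # same default as arkindex/project/settings.py

FACET_FIELDS = [
    'element_type', 'element_worker', 'transcription_worker',
    'classification_name', 'classification_worker',
    'metadata_name', 'metadata_type', 'metadata_worker',
    'entity_type', 'entity_worker',
]


def build_solr_query(query, sources=('element',), **filters):
    # Search the user query in every selected source field, then AND the
    # exact-match filters (metadata_name, entity_type, ...) on top, e.g.
    # (element_text:("I search") OR transcription_text:("I search")) AND (entity_type:"person")
    sources_clause = ' OR '.join('{}_text:({})'.format(source, query) for source in sources)
    filters_clause = ' AND '.join('{}:"{}"'.format(field, value) for field, value in filters.items())
    if filters_clause:
        return '({}) AND ({})'.format(sources_clause, filters_clause)
    return '({})'.format(sources_clause)


def corpus_search(solr, corpus_id, query, sources=('element',), page=1, **filters):
    # One Solr collection per corpus, named project-<corpus UUID>
    collection = 'project-{}'.format(corpus_id)
    if not solr.collections.exists(collection):
        raise ValueError('Corpus index {} not found.'.format(collection))
    response = solr.query(collection, {
        'q': build_solr_query(query, sources, **filters),
        'start': (page - 1) * SOLR_PAGINATION_SIZE,
        'rows': SOLR_PAGINATION_SIZE,
        'facet': True,
        'facet.field': FACET_FIELDS,
    })
    return response.docs, response.get_facets()

For example, searching a corpus for '"I search" OR "Found"' restricted to person entities could look like corpus_search(SolrClient('http://localhost:8983/solr/'), corpus.id, '"I search" OR "Found"', sources=('element', 'transcription'), entity_type='person'), which yields query strings of the shape listed in possible_queries in test_search above; the Solr URL shown here is only the default from arkindex/project/config.py.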