diff --git a/.isort.cfg b/.isort.cfg
index 6ed26f71278eecaa9b5f64d6f2bb8db47d3d15fe..c5adfd361a051c85f23fc405e516926f4c37232b 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -8,4 +8,4 @@ line_length = 120
 default_section=FIRSTPARTY
 known_first_party = ponos,transkribus
-known_third_party = bleach,boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,elasticsearch,elasticsearch_dsl,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
+known_third_party = SolrClient,bleach,boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,elasticsearch,elasticsearch_dsl,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
diff --git a/arkindex/documents/indexer_v2.py b/arkindex/documents/indexer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3725b848473be6ad73cbad0197e1ddbd1195741
--- /dev/null
+++ b/arkindex/documents/indexer_v2.py
@@ -0,0 +1,272 @@
+import datetime
+import itertools
+import json
+import logging
+
+from django.conf import settings
+from django.db.models import prefetch_related_objects
+from SolrClient import SolrClient
+from SolrClient.exceptions import SolrError
+from teklia_toolbox.time import Timer
+
+from arkindex.documents.models import Element
+
+logger = logging.getLogger(__name__)
+
+solr = SolrClient(settings.SOLR_API_URL)
+
+
+class Indexer(object):
+
+    # Chunk of elements processed at once (each element can generate many more documents)
+    elements_chunk_size = 100
+
+    # Chunk of documents sent to Solr per indexing request
+    solr_chunk_size = 200
+    solr_num_shards = 1
+    solr_options = {'commit': True}
+    solr_type_fields = [
+        {'name': 'uuid', 'class': 'solr.UUIDField'}
+    ]
+    solr_fields = [
+        # Parent fields
+        {'name': 'parent_id', 'indexed': False, 'required': True, 'type': 'uuid'},
+        {'name': 'parent_name', 'indexed': False, 'required': True, 'type': 'string'},
+        {'name': 'parent_type', 'indexed': False, 'required': True, 'type': 'string'},
+        # Element fields
+        {'name': 'element_id', 'indexed': False, 'required': True, 'type': 'uuid'},
+        {'name': 'element_text', 'indexed': True, 'required': True, 'type': 'string'},
+        {'name': 'element_type', 'indexed': True, 'required': True, 'type': 'string'},
+        {'name': 'element_worker', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'element_image', 'indexed': False, 'required': False, 'type': 'string'},
+        # Transcription fields
+        {'name': 'transcription_text', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'transcription_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'},
+        {'name': 'transcription_worker', 'indexed': True, 'required': False, 'type': 'string'},
+        # Classification fields
+        {'name': 'classification_name', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'classification_confidence', 'indexed': True, 'required': False, 'type': 'pfloat'},
+        {'name': 'classification_worker', 'indexed': True, 'required': False, 'type': 'string'},
+        # Metadata fields
+        {'name': 'metadata_text', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'metadata_type', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'metadata_worker', 'indexed': True, 'required': False, 'type': 'string'},
+        # Entity fields
+        {'name': 'entity_text', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'entity_type', 'indexed': True, 'required': False, 'type': 'string'},
+        {'name': 'entity_worker', 'indexed': True, 'required': False, 'type': 'string'}
+    ]
+
+    def __init__(self, corpus_id):
+        self.corpus_id = corpus_id
+        self.collection_name = f'project-{self.corpus_id}'
+
+    def setup(self):
+        """
+        Create the collection in Solr, along with its field types and fields
+        """
+        if not solr.collections.exists(self.collection_name):
+            logger.info(f'Creating collection {self.collection_name}')
+            solr.collections.create(self.collection_name, self.solr_num_shards)
+        else:
+            logger.info(f'Collection {self.collection_name} already exists')
+
+        logger.info(f'Creating field types for {self.collection_name}')
+        # Disable SolrClient logging
+        logging.disable(logging.ERROR)
+        for type_field in self.solr_type_fields:
+            # SolrClient does not support field type creation
+            try:
+                solr.transport.send_request(
+                    method='POST',
+                    endpoint=solr.schema.schema_endpoint,
+                    collection=self.collection_name,
+                    data=json.dumps({'add-field-type': type_field})
+                )
+            except SolrError as e:
+                if f"Field type '{type_field['name']}' already exists." not in str(e):
+                    raise e
+        # Restore logging
+        logging.disable(logging.NOTSET)
+
+        logger.info(f'Creating fields for {self.collection_name}')
+        # Disable SolrClient logging
+        logging.disable(logging.INFO)
+        for field in self.solr_fields:
+            if not solr.schema.does_field_exist(self.collection_name, field['name']):
+                solr.schema.create_field(self.collection_name, field)
+        # Restore logging
+        logging.disable(logging.NOTSET)
+
+    def drop_index(self):
+        """
+        Delete all documents from an existing collection
+        """
+        solr.delete_doc_by_query(self.collection_name, '*:*', **self.solr_options)
+        logger.info(f'Dropped index for {self.collection_name}')
+
+    def get_elements(self):
+        """
+        Use a SQL query to keep the link between parent and child across element paths (a parent is also linked to itself).
+        The query lists the parent and child information (id, name and type name).
+        The WHERE clause limits the selection to a single corpus and restricts parents to elements of an indexable type.
+        """
+
+        query = f"""
+        SELECT parent.id as parent_id,
+               parent.name as parent_name,
+               parenttype.display_name as parent_type,
+               element.id as id,
+               element.name as name,
+               elementtype.display_name as type_name
+        FROM documents_element as element
+        LEFT OUTER JOIN documents_elementpath as elementpath ON (element.id = elementpath.element_id)
+        LEFT OUTER JOIN documents_element as parent ON (
+            elementpath.path @> ARRAY[parent.id]
+            OR element.id = parent.id
+        )
+        INNER JOIN documents_elementtype parenttype ON (parent.type_id = parenttype.id)
+        INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
+        WHERE element.corpus_id = '{self.corpus_id}' AND parenttype.indexable
+        """
+        return Element.objects.raw(query)
+
+    def elements_chunk(self, elements):
+        element_iterator = elements.iterator()
+        while True:
+            chunk = tuple(itertools.islice(element_iterator, self.elements_chunk_size))
+            if not chunk:
+                return
+            yield chunk
+
+    def hash_worker(self, worker_version):
+        if not worker_version:
+            return
+
+        worker = worker_version.worker
+        return f'{worker.id.hex[:10]}-{worker.name}'
+
+    def build_documents(self, elements):
+        BUILD_METHODS = [self.build_transcriptions, self.build_classifications, self.build_metadatas, self.build_entities]
+        documents = []
+        for element in elements:
+            document = self.build_element(element)
+            documents.append(document)
+            for method in BUILD_METHODS:
+                documents += method(element, document)
+        return documents
+
+    def build_element(self, element):
+        return {
+            'parent_id': str(element.parent_id),
+            'parent_name': element.parent_name,
+            'parent_type': element.parent_type,
+            'element_id': str(element.id),
+            'element_text': element.name,
+            'element_type': element.type_name,
+            'element_worker': self.hash_worker(element.worker_version),
+            'element_image': element.zone.url if element.zone else None
+        }
+
+    def build_transcriptions(self, element, document):
+        return [
+            dict(document, **{
+                'transcription_text': transcription.text,
+                'transcription_confidence': transcription.confidence,
+                'transcription_worker': self.hash_worker(transcription.worker_version)
+            }) for transcription in element.transcriptions.all()
+        ]
+
+    def build_classifications(self, element, document):
+        return [
+            dict(document, **{
+                'classification_name': classification.ml_class.name,
+                'classification_confidence': classification.confidence,
+                'classification_worker': self.hash_worker(classification.worker_version)
+            }) for classification in element.classifications.all()
+        ]
+
+    def build_metadatas(self, element, document):
+        return [
+            dict(document, **{
+                'metadata_text': metadata.value,
+                'metadata_type': metadata.type.value,
+                'metadata_worker': self.hash_worker(metadata.worker_version)
+            }) for metadata in element.metadatas.all()
+        ]
+
+    def build_entities(self, element, document):
+        entities = [entity for transcription in element.transcriptions.all() for entity in transcription.entities.all()]
+        entities += [metadata.entity for metadata in element.metadatas.all() if metadata.entity]
+        return [
+            dict(document, **{
+                'entity_text': entity.name,
+                'entity_type': entity.type.value,
+                'entity_worker': self.hash_worker(entity.worker_version)
+            }) for entity in entities
+        ]
+
+    def index(self):
+        """
+        Insert items into Solr
+        Process elements with an indexable type and their children in chunks
+        For each chunk:
+        - load their dependencies (transcriptions, classifications...)
+        - serialize items into Solr documents
+        - send the documents to Solr
+        """
+        elements = self.get_elements()
+
+        total_elements, total_documents = 0, 0
+        retrieve_time, build_time, index_time = datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)
+
+        for elements_chunk in self.elements_chunk(elements):
+            nb_elements = len(elements_chunk)
+            total_elements += nb_elements
+            logger.debug(f'Processing {nb_elements} elements...')
+            # Retrieve elements from db
+            with Timer() as t:
+                # Element
+                prefetch_related_objects(elements_chunk, 'zone__image__server')
+                prefetch_related_objects(elements_chunk, 'worker_version__worker')
+                # Transcriptions
+                prefetch_related_objects(elements_chunk, 'transcriptions')
+                prefetch_related_objects(elements_chunk, 'transcriptions__worker_version__worker')
+                # Classifications
+                prefetch_related_objects(elements_chunk, 'classifications')
+                prefetch_related_objects(elements_chunk, 'classifications__worker_version__worker')
+                # Metadatas
+                prefetch_related_objects(elements_chunk, 'metadatas')
+                prefetch_related_objects(elements_chunk, 'metadatas__worker_version__worker')
+                # Entities
+                prefetch_related_objects(elements_chunk, 'transcriptions__entities')
+                prefetch_related_objects(elements_chunk, 'transcriptions__entities__worker_version__worker')
+                prefetch_related_objects(elements_chunk, 'metadatas__entity')
+                prefetch_related_objects(elements_chunk, 'metadatas__entity__worker_version__worker')
+            retrieve_time += t.delta
+            logger.debug(f'Retrieved {nb_elements} elements')
+
+            # Build Solr documents
+            with Timer() as t:
+                documents = self.build_documents(elements_chunk)
+            nb_documents = len(documents)
+            total_documents += nb_documents
+            build_time += t.delta
+            logger.debug(f'Built {nb_documents} Solr documents')
+
+            # Index documents into Solr
+            with Timer() as t:
+                for i in range(0, nb_documents, self.solr_chunk_size):
+                    solr.index(self.collection_name, documents[i:i + self.solr_chunk_size], **self.solr_options)
+            index_time += t.delta
+            logger.debug(f'Indexed {nb_documents} documents into Solr')
+
+            result = solr.query(self.collection_name, {'q': '*:*'})
+            logger.info(f'Currently {result.get_num_found()} documents in Solr')
+
+        logger.info(f'Retrieved {total_elements} elements in {retrieve_time}')
+        logger.info(f'Built {total_documents} Solr documents in {build_time}')
+        logger.info(f'Indexed {total_documents} documents into Solr in {index_time}')
+
+        result = solr.query(self.collection_name, {'q': '*:*'})
+        logger.info(f'Currently {result.get_num_found()} documents in Solr')
diff --git a/arkindex/documents/management/commands/reindex_v2.py b/arkindex/documents/management/commands/reindex_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d38345a09a052a8c2619435bd74f768f20320c2
--- /dev/null
+++ b/arkindex/documents/management/commands/reindex_v2.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import uuid
+
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+
+from arkindex.documents.indexer_v2 import Indexer
+from arkindex.documents.models import Corpus
+
+
+class Command(BaseCommand):
+    help = 'Reindex elements, transcriptions, classifications, metadata and entities into Solr'
+
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument(
+            "corpus_id", help="UUID of an existing corpus to reindex", type=uuid.UUID
+        )
+        parser.add_argument(
+            '--drop',
+            help="Drop the existing indexes before reindexing",
+            action='store_true',
+        )
+
+    def handle(self, corpus_id, **options):
+        if not settings.ARKINDEX_FEATURES['search_v2']:
+            raise CommandError('Reindexing is not possible when the search feature flag is disabled. '
+                               'Consider setting `features.search_v2` to `on`, `true` or `yes` in the YAML '
+                               'configuration file, and configuring Solr properly.')
+
+        try:
+            corpus = Corpus.objects.get(id=corpus_id)
+        except Corpus.DoesNotExist:
+            raise CommandError(f'Corpus {corpus_id} does not exist')
+
+        if not corpus.indexable:
+            raise CommandError(f'Corpus {corpus.name} is not indexable')
+
+        indexer = Indexer(corpus.id)
+        indexer.setup()
+        if options.get('drop'):
+            indexer.drop_index()
+        indexer.index()
diff --git a/arkindex/documents/migrations/0031_add_indexable_fields.py b/arkindex/documents/migrations/0031_add_indexable_fields.py
new file mode 100644
index 0000000000000000000000000000000000000000..daee5457e4dce1d2e0fc34fc5fd42de7a015ea9b
--- /dev/null
+++ b/arkindex/documents/migrations/0031_add_indexable_fields.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.7 on 2021-04-08 09:01
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0030_convert_html_metadata_as_markdown'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='corpus',
+            name='indexable',
+            field=models.BooleanField(default=False),
+        ),
+        migrations.AddField(
+            model_name='elementtype',
+            name='indexable',
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 3fa88944d72c874ae1dea95f9504e024cbbdad7b..384355219d490a2a931bb890c35220154821fc49 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -50,6 +50,9 @@ class Corpus(IndexableModel):
     # Is this corpus publicly readable ?
     public = models.BooleanField(default=False)
 
+    # Is this corpus indexable ?
+ indexable = models.BooleanField(default=False) + # Specific manager for ACL objects = CorpusManager() @@ -78,6 +81,7 @@ class ElementType(models.Model): slug = models.SlugField() display_name = models.CharField(max_length=250) folder = models.BooleanField(default=False) + indexable = models.BooleanField(default=False) class Meta: constraints = [ diff --git a/arkindex/documents/tests/commands/test_reindex_v2.py b/arkindex/documents/tests/commands/test_reindex_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e0b7e3eea5495eebfce7a9ff23e9fcd14256b278 --- /dev/null +++ b/arkindex/documents/tests/commands/test_reindex_v2.py @@ -0,0 +1,497 @@ +from unittest.mock import patch + +from django.core.management import CommandError, call_command +from django.test import override_settings + +from arkindex.dataimport.models import WorkerVersion +from arkindex.documents.models import Corpus, EntityType, MetaType +from arkindex.project.tests import FixtureTestCase + + +@override_settings(ARKINDEX_FEATURES={'search_v2': True}) +class TestReindexV2Command(FixtureTestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.private_corpus = Corpus.objects.create(name='private', indexable=True) + cls.worker_version = WorkerVersion.objects.first() + cls.worker = cls.worker_version.worker + + # Create element types + folder_type, _ = cls.private_corpus.types.get_or_create(slug='folder', display_name='Folder', folder=True) + cls.page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) + cls.line_type, _ = cls.private_corpus.types.get_or_create(slug='text_line', display_name='Line') + cls.word_type, _ = cls.private_corpus.types.get_or_create(slug='word', display_name='Word') + + # Create elements + vol = cls.private_corpus.elements.create(name='Folder', type=folder_type) + cls.page = cls.private_corpus.elements.create(name='New page', type=cls.page_type) + cls.line = cls.private_corpus.elements.create(name='A line', type=cls.line_type) + cls.page.add_parent(vol) + cls.line.add_parent(cls.page) + + @patch("arkindex.documents.indexer_v2.solr") + def test_run_empty_element(self, mock_solr): + """ + Test the reindex command with no indexable type + """ + self.page_type.indexable = False + self.page_type.save() + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 0) + + @patch('arkindex.documents.indexer_v2.solr') + def test_run_multiple_elements(self, mock_solr): + """ + Test the reindex command for multiple elements + """ + word = self.private_corpus.elements.create(name='A word', type=self.word_type) + word.add_parent(self.line) + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, 
+ 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(word.id), + 'element_text': word.name, + 'element_type': word.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }] + ) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer_v2.solr') + def test_run_transcriptions(self, mock_solr): + """ + Test the reindex command for element with transcriptions + """ + tr_1 = self.line.transcriptions.create( + confidence=0.8, + text='Transcription for the line', + worker_version=self.worker_version, + ) + tr_2 = self.line.transcriptions.create( + confidence=0.5, + text='Second transcription', + worker_version=self.worker_version, + ) + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_confidence': tr_1.confidence, + 'transcription_text': tr_1.text, + 'transcription_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}' + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_confidence': tr_2.confidence, + 'transcription_text': tr_2.text, + 'transcription_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}' + }] + ) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer_v2.solr') + def test_run_classifications(self, mock_solr): + """ + Test the reindex command for element with classifications + """ + cl_1 = self.line.classifications.create( + confidence=0.8, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Cat') + ) + cl_2 = self.line.classifications.create( + confidence=0.4, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Dog') + ) + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + 
self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'classification_name': cl_1.ml_class.name, + 'classification_confidence': cl_1.confidence, + 'classification_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'classification_name': cl_2.ml_class.name, + 'classification_confidence': cl_2.confidence, + 'classification_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer_v2.solr') + def test_run_metadatas(self, mock_solr): + """ + Test the reindex command for element with metadatas + """ + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + self.private_corpus.allowed_metadatas.create(type=MetaType.Text, name='Folio') + md_1 = self.line.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + worker_version=self.worker_version, + ) + md_2 = self.line.metadatas.create( + type=MetaType.Text, + name='Folio', + value='1', + worker_version=self.worker_version, + ) + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_text': md_1.value, + 'metadata_type': md_1.type.value, + 'metadata_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'element_id': 
str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_text': md_2.value, + 'metadata_type': md_2.type.value, + 'metadata_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + } + ]) + self.assertDictEqual(kwargs, {'commit': True}) + + @patch('arkindex.documents.indexer_v2.solr') + def test_run_entities(self, mock_solr): + """ + Test the reindex command for element with entities + """ + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.line.transcriptions.create( + confidence=0.8, + text='Transcription for the line', + ) + entity_1.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity_1.name) + ) + entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) + md = self.line.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + entity=entity_2 + ) + + call_command('reindex_v2', self.private_corpus.id) + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0) + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, f'project-{self.private_corpus.id}') + self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'transcription_confidence': tr.confidence, + 'transcription_text': tr.text, + 'transcription_worker': None + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'metadata_text': md.value, + 'metadata_type': md.type.value, + 'metadata_worker': None + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': None, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'entity_text': entity_1.name, + 'entity_type': entity_1.type.value, + 'entity_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'element_id': str(self.line.id), + 'element_text': self.line.name, + 'element_type': self.line.type.display_name, + 'element_worker': None, + 'element_image': 
None,
+                'parent_id': str(self.page.id),
+                'parent_name': self.page.name,
+                'parent_type': self.page.type.display_name,
+                'entity_text': entity_2.name,
+                'entity_type': entity_2.type.value,
+                'entity_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}',
+            }
+        ])
+        self.assertDictEqual(kwargs, {'commit': True})
+
+    @patch('arkindex.documents.indexer_v2.solr')
+    def test_run_element_worker(self, mock_solr):
+        """
+        Test the reindex command for element with a worker version
+        """
+        self.line.worker_version = self.worker_version
+        self.line.save()
+
+        call_command('reindex_v2', self.private_corpus.id)
+        self.assertEqual(mock_solr.delete_doc_by_query.call_count, 0)
+        self.assertEqual(mock_solr.index.call_count, 1)
+        (index_name, documents), kwargs = mock_solr.index.call_args
+        self.assertEqual(index_name, f'project-{self.private_corpus.id}')
+        self.assertListEqual(documents, [
+            {
+                'element_id': str(self.page.id),
+                'element_text': self.page.name,
+                'element_type': self.page.type.display_name,
+                'element_worker': None,
+                'element_image': None,
+                'parent_id': str(self.page.id),
+                'parent_name': self.page.name,
+                'parent_type': self.page.type.display_name,
+            },
+            {
+                'element_id': str(self.line.id),
+                'element_text': self.line.name,
+                'element_type': self.line.type.display_name,
+                'element_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}',
+                'element_image': None,
+                'parent_id': str(self.page.id),
+                'parent_name': self.page.name,
+                'parent_type': self.page.type.display_name,
+            }
+        ])
+        self.assertDictEqual(kwargs, {'commit': True})
+
+    @patch('arkindex.documents.indexer_v2.solr')
+    def test_drop(self, mock_solr):
+        """
+        Test the reindex command can drop indexes
+        """
+        self.page_type.indexable = False
+        self.page_type.save()
+
+        call_command('reindex_v2', self.private_corpus.id, drop=True)
+        self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1)
+        (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args
+        self.assertEqual(index_name, f'project-{self.private_corpus.id}')
+        self.assertEqual(query, '*:*')
+        self.assertDictEqual(kwargs, {'commit': True})
+
+    def test_corpus_not_found(self):
+        corpus_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
+        with self.assertRaises(CommandError) as context:
+            call_command('reindex_v2', corpus_id)
+        self.assertEqual(
+            str(context.exception),
+            f'Corpus {corpus_id} does not exist'
+        )
+
+    def test_corpus_not_indexable(self):
+        self.private_corpus.indexable = False
+        self.private_corpus.save()
+
+        with self.assertRaises(CommandError) as context:
+            call_command('reindex_v2', self.private_corpus.id)
+
+        self.assertEqual(
+            str(context.exception),
+            f'Corpus {self.private_corpus.name} is not indexable'
+        )
+
+    @override_settings(ARKINDEX_FEATURES={'search_v2': False})
+    def test_no_search(self):
+        with self.assertRaises(CommandError) as context:
+            call_command('reindex_v2', self.private_corpus.id)
+        self.assertEqual(
+            str(context.exception),
+            'Reindexing is not possible when the search feature flag is disabled. '
+            'Consider setting `features.search_v2` to `on`, `true` or `yes` in the YAML '
+            'configuration file, and configuring Solr properly.'
+ ) diff --git a/arkindex/documents/tests/test_indexer_v2.py b/arkindex/documents/tests/test_indexer_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..1a5e0e04081773961b0e53d2f605abbfa4e41d2f --- /dev/null +++ b/arkindex/documents/tests/test_indexer_v2.py @@ -0,0 +1,310 @@ +import json +from unittest.mock import patch + +from django.db.models import CharField, Value + +from arkindex.dataimport.models import WorkerVersion +from arkindex.documents.indexer_v2 import Indexer +from arkindex.documents.models import Corpus, EntityType, MetaType +from arkindex.images.models import Zone +from arkindex.project.tests import FixtureTestCase + + +class TestIndexerV2Command(FixtureTestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.private_corpus = Corpus.objects.create(name='private', indexable=True) + cls.worker_version = WorkerVersion.objects.first() + cls.worker = cls.worker_version.worker + page_type, _ = cls.private_corpus.types.get_or_create(slug='page', display_name='Page', indexable=True) + cls.page = cls.private_corpus.elements.create(name='New page', type=page_type, worker_version=cls.worker_version) + + @patch('arkindex.documents.indexer_v2.solr') + def test_setup(self, mock_solr): + mock_collections = mock_solr.collections + mock_schema = mock_solr.schema + mock_collections.exists.return_value = False + mock_schema.does_field_exist.return_value = False + + indexer = Indexer('corpus_id') + indexer.setup() + + self.assertEqual(mock_collections.exists.call_count, 1) + (index_name, ), _ = mock_collections.exists.call_args + self.assertEqual(index_name, indexer.collection_name) + + self.assertEqual(mock_collections.create.call_count, 1) + (index_name, args), _ = mock_collections.create.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertEqual(args, indexer.solr_num_shards) + + self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) + for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): + self.assertDictEqual(kwargs, { + 'method': 'POST', + 'endpoint': mock_solr.schema.schema_endpoint, + 'collection': indexer.collection_name, + 'data': json.dumps({'add-field-type': type_field}) + }) + + self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_schema.create_field.call_count, len(indexer.solr_fields)) + for field, ((index_name, args), _) in zip(indexer.solr_fields, mock_schema.create_field.call_args_list): + self.assertEqual(index_name, indexer.collection_name) + self.assertDictEqual(args, field) + + @patch('arkindex.documents.indexer_v2.solr') + def test_already_setup(self, mock_solr): + mock_collections = mock_solr.collections + mock_schema = mock_solr.schema + mock_collections.exists.return_value = True + mock_schema.does_field_exist.return_value = True + + indexer = Indexer('corpus_id') + indexer.setup() + + self.assertEqual(mock_collections.exists.call_count, 1) + (index_name, ), _ = mock_collections.exists.call_args + self.assertEqual(index_name, indexer.collection_name) + + self.assertEqual(mock_collections.create.call_count, 0) + + self.assertEqual(mock_solr.transport.send_request.call_count, len(indexer.solr_type_fields)) + for type_field, (_, kwargs) in zip(indexer.solr_type_fields, mock_solr.transport.send_request.call_args_list): + self.assertDictEqual(kwargs, { + 'method': 'POST', + 'endpoint': mock_solr.schema.schema_endpoint, + 'collection': 
indexer.collection_name, + 'data': json.dumps({'add-field-type': type_field}) + }) + + self.assertEqual(mock_schema.does_field_exist.call_count, len(indexer.solr_fields)) + self.assertEqual(mock_schema.create_field.call_count, 0) + + @patch('arkindex.documents.indexer_v2.solr') + def test_drop_index(self, mock_solr): + indexer = Indexer('corpus_id') + indexer.drop_index() + + self.assertEqual(mock_solr.delete_doc_by_query.call_count, 1) + (index_name, query), kwargs = mock_solr.delete_doc_by_query.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertEqual(query, '*:*') + self.assertDictEqual(kwargs, {'commit': True}) + + def test_hash_worker(self): + indexer = Indexer(None) + self.assertIsNone(indexer.hash_worker(None)) + self.assertEqual(indexer.hash_worker(self.worker_version), f'{self.worker.id.hex[:10]}-{self.worker.name}') + + def test_build_element(self): + annotated_pages = self.private_corpus.elements.filter(id=self.page.id).annotate( + parent_id=Value(self.page.id, output_field=CharField()), + parent_name=Value(self.page.name, output_field=CharField()), + parent_type=Value(self.page.type.display_name, output_field=CharField()), + type_name=Value(self.page.type.display_name, output_field=CharField()) + ) + indexer = Indexer(None) + self.assertDictEqual(indexer.build_element(annotated_pages.first()), { + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + 'element_image': None + }) + + def test_build_transcriptions(self): + tr_1 = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + worker_version=self.worker_version, + ) + tr_2 = self.page.transcriptions.create( + confidence=0.5, + text='Second transcription', + worker_version=self.worker_version, + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_transcriptions(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'transcription_confidence': tr_1.confidence, + 'transcription_text': tr_1.text, + 'transcription_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}' + }, + { + 'key': 'value', + 'transcription_confidence': tr_2.confidence, + 'transcription_text': tr_2.text, + 'transcription_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}' + } + ]) + + def test_build_classifications(self): + cl_1 = self.page.classifications.create( + confidence=0.8, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Cat') + ) + cl_2 = self.page.classifications.create( + confidence=0.4, + worker_version=self.worker_version, + ml_class=self.private_corpus.ml_classes.create(name='Dog') + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_classifications(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'classification_name': cl_1.ml_class.name, + 'classification_confidence': cl_1.confidence, + 'classification_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'key': 'value', + 'classification_name': cl_2.ml_class.name, + 'classification_confidence': cl_2.confidence, + 'classification_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + } + ]) + + def test_build_metadatas(self): + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + self.private_corpus.allowed_metadatas.create(type=MetaType.Text, 
name='Folio') + md_1 = self.page.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + worker_version=self.worker_version, + ) + md_2 = self.page.metadatas.create( + type=MetaType.Text, + name='Folio', + value='1', + worker_version=self.worker_version, + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_metadatas(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'metadata_text': md_1.value, + 'metadata_type': md_1.type.value, + 'metadata_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'key': 'value', + 'metadata_text': md_2.value, + 'metadata_type': md_2.type.value, + 'metadata_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + } + ]) + + def test_build_entities(self): + self.private_corpus.allowed_metadatas.create(type=MetaType.Location, name='Country') + entity_1 = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + ) + entity_1.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity_1.name) + ) + entity_2 = self.private_corpus.entities.create(name="Robert", type=EntityType.Person, worker_version=self.worker_version) + self.page.metadatas.create( + type=MetaType.Location, + name='Country', + value='France', + entity=entity_2 + ) + indexer = Indexer(None) + self.assertListEqual(indexer.build_entities(self.page, {'key': 'value'}), [ + { + 'key': 'value', + 'entity_text': entity_1.name, + 'entity_type': entity_1.type.value, + 'entity_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }, + { + 'key': 'value', + 'entity_text': entity_2.name, + 'entity_type': entity_2.type.value, + 'entity_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + } + ]) + + @patch('arkindex.documents.indexer_v2.solr') + def test_index(self, mock_solr): + zone = Zone.objects.first() + self.page.zone = zone + self.page.save() + + entity = self.private_corpus.entities.create(name="CDLK", type=EntityType.Location, worker_version=self.worker_version) + tr = self.page.transcriptions.create( + confidence=0.8, + text='Transcription on the page', + ) + entity.transcription_entities.create( + transcription=tr, + offset=0, + length=len(entity.name) + ) + + indexer = Indexer(self.private_corpus.id) + with self.assertExactQueries('indexer_prefetch_v2.sql', params={ + 'corpus_id': self.private_corpus.id, + 'page_id': self.page.id, + 'zone_id': zone.id, + 'image_id': zone.image.id, + 'worker_version_id': self.worker_version.id, + 'worker_id': self.worker.id, + 'transcription_id': tr.id + }): + indexer.index() + self.assertEqual(mock_solr.index.call_count, 1) + (index_name, documents), kwargs = mock_solr.index.call_args + self.assertEqual(index_name, indexer.collection_name) + self.assertListEqual(documents, [ + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + 'element_image': zone.url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + }, + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + 'element_image': zone.url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': 
self.page.type.display_name, + 'transcription_confidence': tr.confidence, + 'transcription_text': tr.text, + 'transcription_worker': None + }, + { + 'element_id': str(self.page.id), + 'element_text': self.page.name, + 'element_type': self.page.type.display_name, + 'element_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + 'element_image': zone.url, + 'parent_id': str(self.page.id), + 'parent_name': self.page.name, + 'parent_type': self.page.type.display_name, + 'entity_text': entity.name, + 'entity_type': entity.type.value, + 'entity_worker': f'{self.worker.id.hex[:10]}-{self.worker.name}', + }] + ) + self.assertDictEqual(kwargs, {'commit': True}) diff --git a/arkindex/project/config.py b/arkindex/project/config.py index 4701097388d69e3b553b29d08c8f1b13bb138ec9..e7c479657d35fc8fbabb01d74e49c24dffba9667 100644 --- a/arkindex/project/config.py +++ b/arkindex/project/config.py @@ -73,6 +73,9 @@ def get_settings_parser(base_dir): elasticsearch_parser = parser.add_subparser('elasticsearch', default={}) elasticsearch_parser.add_option('hosts', type=str, many=True, default=['localhost']) + solr_parser = parser.add_subparser('solr', default={}) + solr_parser.add_option('api_url', type=str, default='http://localhost:8983/solr/') + influxdb_parser = parser.add_subparser('influxdb', default={}) influxdb_parser.add_option('api_url', type=str, default='http://localhost:8086/') @@ -133,6 +136,7 @@ def get_settings_parser(base_dir): features_parser.add_option('signup', type=bool, default=True) features_parser.add_option('selection', type=bool, default=True) features_parser.add_option('search', type=bool, default=True) + features_parser.add_option('search_v2', type=bool, default=False) features_parser.add_option('transkribus', type=bool, default=True) features_parser.add_option('workers', type=bool, default=False) diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index c80afbb1a8b155139a24566a3a66b0e4094246b0..d063d71d6cf426de75b16b4b07444ba973fb9f55 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -274,6 +274,9 @@ SPECTACULAR_SETTINGS = { ] } +# Solr config +SOLR_API_URL = conf['solr']['api_url'] + SEARCH_FILTER_MAX_TERMS = 10 # Elastic search config diff --git a/arkindex/project/tests/config_samples/defaults.yaml b/arkindex/project/tests/config_samples/defaults.yaml index 817192f10db7773d05da95339940e48a8dfe470b..512057d754470c77340b0200f96bfb7e9744a4da 100644 --- a/arkindex/project/tests/config_samples/defaults.yaml +++ b/arkindex/project/tests/config_samples/defaults.yaml @@ -33,6 +33,7 @@ elasticsearch: email: null features: search: true + search_v2: false selection: true signup: true transkribus: true @@ -74,6 +75,8 @@ session: cookie_name: arkindex.auth cookie_samesite: lax cookie_secure: false +solr: + api_url: http://localhost:8983/solr/ static: cdn_assets_url: null frontend_version: null diff --git a/arkindex/project/tests/config_samples/override.yaml b/arkindex/project/tests/config_samples/override.yaml index b84bbdea03b4c821f219cc8c3bc66b8f6b2c6558..502757a00789bb52a13bf8cd3862ee55db0ee261 100644 --- a/arkindex/project/tests/config_samples/override.yaml +++ b/arkindex/project/tests/config_samples/override.yaml @@ -47,6 +47,7 @@ email: user: teklia@wanadoo.fr features: search: false + search_v2: true selection: false signup: false transkribus: false @@ -89,6 +90,8 @@ session: cookie_name: stonehenge cookie_samesite: false cookie_secure: true +solr: + api_url: http://nowhere/solr/ static: cdn_assets_url: http://cdn.teklia.horse/ 
frontend_version: 1.2.3-alpha4 diff --git a/arkindex/sql_validation/corpus_delete.sql b/arkindex/sql_validation/corpus_delete.sql index 6bf82198f5dbfff4c80d104415516fd3ca1bf9e0..4a6363710fe0990352e7f91e0b7c8882a7e7cd81 100644 --- a/arkindex/sql_validation/corpus_delete.sql +++ b/arkindex/sql_validation/corpus_delete.sql @@ -5,7 +5,8 @@ SELECT "documents_corpus"."created", "documents_corpus"."description", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", - "documents_corpus"."public" + "documents_corpus"."public", + "documents_corpus"."indexable" FROM "documents_corpus" WHERE "documents_corpus"."id" = '{corpus_id}'::uuid LIMIT 21; diff --git a/arkindex/sql_validation/corpus_rights_filter.sql b/arkindex/sql_validation/corpus_rights_filter.sql index 56b054be6e53757c0a2ea178ccb7b7fbbb4c00bf..27f0840627ee8ddd6999e3b3adb246b2b5962909 100644 --- a/arkindex/sql_validation/corpus_rights_filter.sql +++ b/arkindex/sql_validation/corpus_rights_filter.sql @@ -22,6 +22,7 @@ SELECT "documents_corpus"."created", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", "documents_corpus"."public", + "documents_corpus"."indexable", LEAST("users_right"."level", T5."level") AS "max_level" FROM "documents_corpus" INNER JOIN "users_right" ON ("documents_corpus"."id" = "users_right"."content_id" diff --git a/arkindex/sql_validation/corpus_rights_filter_public.sql b/arkindex/sql_validation/corpus_rights_filter_public.sql index 32c8379c26f1755f35bc1b433aa2cd064cd32a0c..302515a63c807a32d4456013b7e59d855f1c41b2 100644 --- a/arkindex/sql_validation/corpus_rights_filter_public.sql +++ b/arkindex/sql_validation/corpus_rights_filter_public.sql @@ -23,6 +23,7 @@ LIMIT 21; "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", "documents_corpus"."public", + "documents_corpus"."indexable", LEAST("users_right"."level", T5."level") AS "max_level" FROM "documents_corpus" INNER JOIN "users_right" ON ("documents_corpus"."id" = "users_right"."content_id" @@ -42,6 +43,7 @@ UNION "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", "documents_corpus"."public", + "documents_corpus"."indexable", 10 AS "max_level" FROM "documents_corpus" WHERE "documents_corpus"."public") diff --git a/arkindex/sql_validation/element_links.sql b/arkindex/sql_validation/element_links.sql index 12265310c6c4cc2896e6e79cd90a7d4ac5fc2fae..724bc98c29d8da50764a8156dfeaa8011f77c94d 100644 --- a/arkindex/sql_validation/element_links.sql +++ b/arkindex/sql_validation/element_links.sql @@ -13,7 +13,8 @@ SELECT "documents_element"."id", "documents_corpus"."description", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", - "documents_corpus"."public" + "documents_corpus"."public", + "documents_corpus"."indexable" FROM "documents_element" INNER JOIN "documents_corpus" ON ("documents_element"."corpus_id" = "documents_corpus"."id") WHERE "documents_element"."id" = '{element_id}'::uuid diff --git a/arkindex/sql_validation/element_links_not_found.sql b/arkindex/sql_validation/element_links_not_found.sql index 78fc490178f93c8be0b2507d9fcb9f68cacdade3..172aae7e86f6e33f2e17c649ff91b6d131b6f26e 100644 --- a/arkindex/sql_validation/element_links_not_found.sql +++ b/arkindex/sql_validation/element_links_not_found.sql @@ -13,7 +13,8 @@ SELECT "documents_element"."id", "documents_corpus"."description", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", - "documents_corpus"."public" + "documents_corpus"."public", + 
"documents_corpus"."indexable" FROM "documents_element" INNER JOIN "documents_corpus" ON ("documents_element"."corpus_id" = "documents_corpus"."id") WHERE "documents_element"."id" = '{element_id}'::uuid diff --git a/arkindex/sql_validation/indexer_prefetch_v2.sql b/arkindex/sql_validation/indexer_prefetch_v2.sql new file mode 100644 index 0000000000000000000000000000000000000000..1d34947b665d24ff6b6ef09d200fc42cce5d9ce6 --- /dev/null +++ b/arkindex/sql_validation/indexer_prefetch_v2.sql @@ -0,0 +1,154 @@ +SELECT parent.id as parent_id, + parent.name as parent_name, + parenttype.display_name as parent_type, + element.id as id, + element.name as name, + elementtype.display_name as type_name +FROM documents_element as element +LEFT OUTER JOIN documents_elementpath as elementpath ON (element.id = elementpath.element_id) +LEFT OUTER JOIN documents_element as parent ON (elementpath.path @ > ARRAY[parent.id] + OR element.id = parent.id) +INNER JOIN documents_elementtype parenttype ON (parent.type_id = parenttype.id) +INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id) +WHERE element.corpus_id = '{corpus_id}' + AND parenttype.indexable ; + +SELECT "documents_element"."id", + "documents_element"."zone_id" +FROM "documents_element" +WHERE "documents_element"."id" = '{page_id}'::uuid +LIMIT 21; + +SELECT "images_zone"."id", + "images_zone"."created", + "images_zone"."updated", + "images_zone"."image_id", + "images_zone"."polygon"::bytea +FROM "images_zone" +WHERE "images_zone"."id" IN ('{zone_id}'::uuid); + +SELECT "images_image"."id", + "images_image"."created", + "images_image"."updated", + "images_image"."server_id", + "images_image"."path", + "images_image"."width", + "images_image"."height", + "images_image"."hash", + "images_image"."status" +FROM "images_image" +WHERE "images_image"."id" IN ('{image_id}'::uuid); + +SELECT "images_imageserver"."id", + "images_imageserver"."display_name", + "images_imageserver"."url", + "images_imageserver"."s3_bucket", + "images_imageserver"."s3_region", + "images_imageserver"."max_width", + "images_imageserver"."max_height", + "images_imageserver"."created", + "images_imageserver"."updated", + "images_imageserver"."validated", + "images_imageserver"."read_only" +FROM "images_imageserver" +WHERE "images_imageserver"."id" IN (1); + +SELECT "documents_element"."id", + "documents_element"."worker_version_id" +FROM "documents_element" +WHERE "documents_element"."id" = '{page_id}'::uuid +LIMIT 21; + +SELECT "dataimport_workerversion"."id", + "dataimport_workerversion"."worker_id", + "dataimport_workerversion"."revision_id", + "dataimport_workerversion"."configuration", + "dataimport_workerversion"."state", + "dataimport_workerversion"."docker_image_id", + "dataimport_workerversion"."docker_image_iid" +FROM "dataimport_workerversion" +WHERE "dataimport_workerversion"."id" IN ('{worker_version_id}'::uuid); + +SELECT "dataimport_worker"."id", + "dataimport_worker"."name", + "dataimport_worker"."slug", + "dataimport_worker"."type", + "dataimport_worker"."repository_id", + "dataimport_worker"."public" +FROM "dataimport_worker" +WHERE "dataimport_worker"."id" IN ('{worker_id}'::uuid); + +SELECT "documents_transcription"."id", + "documents_transcription"."element_id", + "documents_transcription"."worker_version_id", + "documents_transcription"."text", + "documents_transcription"."confidence" +FROM "documents_transcription" +WHERE "documents_transcription"."element_id" IN ('{page_id}'::uuid); + +SELECT "dataimport_workerversion"."id", + 
"dataimport_workerversion"."worker_id", + "dataimport_workerversion"."revision_id", + "dataimport_workerversion"."configuration", + "dataimport_workerversion"."state", + "dataimport_workerversion"."docker_image_id", + "dataimport_workerversion"."docker_image_iid" +FROM "dataimport_workerversion" +WHERE "dataimport_workerversion"."id" IN (NULL); + +SELECT "documents_classification"."id", + "documents_classification"."element_id", + "documents_classification"."worker_version_id", + "documents_classification"."moderator_id", + "documents_classification"."ml_class_id", + "documents_classification"."high_confidence", + "documents_classification"."state", + "documents_classification"."confidence" +FROM "documents_classification" +WHERE "documents_classification"."element_id" IN ('{page_id}'::uuid); + +SELECT "documents_metadata"."id", + "documents_metadata"."element_id", + "documents_metadata"."name", + "documents_metadata"."type", + "documents_metadata"."value", + "documents_metadata"."index", + "documents_metadata"."entity_id", + "documents_metadata"."worker_version_id" +FROM "documents_metadata" +WHERE "documents_metadata"."element_id" IN ('{page_id}'::uuid) +ORDER BY "documents_metadata"."element_id" ASC, + "documents_metadata"."name" ASC, + "documents_metadata"."index" ASC; + +SELECT ("documents_transcriptionentity"."transcription_id") AS "_prefetch_related_val_transcription_id", + "documents_entity"."id", + "documents_entity"."name", + "documents_entity"."type", + "documents_entity"."corpus_id", + "documents_entity"."metas", + "documents_entity"."validated", + "documents_entity"."moderator_id", + "documents_entity"."worker_version_id" +FROM "documents_entity" +INNER JOIN "documents_transcriptionentity" ON ("documents_entity"."id" = "documents_transcriptionentity"."entity_id") +WHERE "documents_transcriptionentity"."transcription_id" IN ('{transcription_id}'::uuid); + +SELECT "dataimport_workerversion"."id", + "dataimport_workerversion"."worker_id", + "dataimport_workerversion"."revision_id", + "dataimport_workerversion"."configuration", + "dataimport_workerversion"."state", + "dataimport_workerversion"."docker_image_id", + "dataimport_workerversion"."docker_image_iid" +FROM "dataimport_workerversion" +WHERE "dataimport_workerversion"."id" IN ('{worker_version_id}'::uuid); + +SELECT "dataimport_worker"."id", + "dataimport_worker"."name", + "dataimport_worker"."slug", + "dataimport_worker"."type", + "dataimport_worker"."repository_id", + "dataimport_worker"."public" +FROM "dataimport_worker" +WHERE "dataimport_worker"."id" IN ('{worker_id}'::uuid) \ No newline at end of file diff --git a/arkindex/sql_validation/list_elements.sql b/arkindex/sql_validation/list_elements.sql index dca62a2f17aa20e8ad8b7502c64b13e6320c0a4f..5fbad4ccdd3a114022b039ab2af180e096de9979 100644 --- a/arkindex/sql_validation/list_elements.sql +++ b/arkindex/sql_validation/list_elements.sql @@ -5,7 +5,8 @@ SELECT "documents_corpus"."created", "documents_corpus"."description", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", - "documents_corpus"."public" + "documents_corpus"."public", + "documents_corpus"."indexable" FROM "documents_corpus" WHERE "documents_corpus"."id" = '{corpus_id}'::uuid LIMIT 21; @@ -31,11 +32,13 @@ SELECT "documents_element"."id", "documents_corpus"."repository_id", "documents_corpus"."top_level_type_id", "documents_corpus"."public", + "documents_corpus"."indexable", "documents_elementtype"."id", "documents_elementtype"."corpus_id", "documents_elementtype"."slug", 
"documents_elementtype"."display_name", - "documents_elementtype"."folder" + "documents_elementtype"."folder", + "documents_elementtype"."indexable" FROM "documents_element" INNER JOIN "documents_corpus" ON ("documents_element"."corpus_id" = "documents_corpus"."id") INNER JOIN "documents_elementtype" ON ("documents_element"."type_id" = "documents_elementtype"."id") diff --git a/requirements.txt b/requirements.txt index 40ec288a87035ef87f1f99d37fab56e4acb9e437..b63687a5a0c6a5cb6828564e56e478fd8a9d3b1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ pytz==2021.1 PyYAML==5.4.1 requests==2.25.1 sentry-sdk==0.20.3 +SolrClient==0.3.1 teklia-toolbox==0.1.2 tenacity==6.3.1 transkribus-client>=0.1.1