From 44929e5ae1884dc4a8f9b8446688df2091f90eea Mon Sep 17 00:00:00 2001
From: ml bonhomme <bonhomme@teklia.com>
Date: Tue, 17 Dec 2024 12:42:33 +0000
Subject: [PATCH] Allow searching with classification names

---
 arkindex/documents/indexer.py                   |  2 ++
 arkindex/documents/serializers/search.py        |  6 ++++--
 .../documents/tests/commands/test_reindex.py    |  2 ++
 arkindex/documents/tests/test_indexer.py        |  2 ++
 arkindex/documents/tests/test_search_api.py     | 17 ++++++++++-------
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arkindex/documents/indexer.py b/arkindex/documents/indexer.py
index d90d4f9a47..3ef0188ad3 100644
--- a/arkindex/documents/indexer.py
+++ b/arkindex/documents/indexer.py
@@ -123,6 +123,7 @@ class Indexer:
         # Classification fields
         {"name": "classification_id", "indexed": False, "required": False, "type": "uuid"},
         {"name": "classification_name", "indexed": True, "required": False, "type": "full_string"},
+        {"name": "classification_text", "indexed": True, "required": False, "type": "string"},
         {"name": "classification_confidence", "indexed": True, "required": False, "type": "pfloat"},
         {"name": "classification_worker", "indexed": True, "required": False, "type": "full_string"},
         # Metadata fields
@@ -269,6 +270,7 @@ class Indexer:
                 "id": str(self.build_solr_id(element, classification)),
                 "classification_id": str(classification.id),
                 "classification_name": classification.ml_class.name,
+                "classification_text": classification.ml_class.name,
                 "classification_confidence": classification.confidence,
                 "classification_worker": self.hash_worker(classification.worker_run)
             }) for classification in element.classifications.all()
diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py
index 799ade083c..7a5d499f63 100644
--- a/arkindex/documents/serializers/search.py
+++ b/arkindex/documents/serializers/search.py
@@ -32,6 +32,7 @@ class SolrDocumentSerializer(serializers.Serializer):
 
     classification_id = serializers.UUIDField(allow_null=True)
     classification_name = serializers.CharField(allow_null=True)
+    classification_text = serializers.CharField(allow_null=True)
     classification_confidence = serializers.FloatField(min_value=0, max_value=1, allow_null=True)
     classification_worker = serializers.CharField(allow_null=True)
 
@@ -102,9 +103,10 @@ class CorpusSearchQuerySerializer(serializers.Serializer):
             ("element", "element"),
             ("transcription", "transcription"),
             ("metadata", "metadata"),
-            ("entity", "entity")
+            ("entity", "entity"),
+            ("classification", "classification")
         ],
-        default={"element", "transcription", "metadata", "entity"},
+        default={"element", "transcription", "metadata", "entity", "classification"},
         help_text="List of sources to be searched on.",
     )
 
diff --git a/arkindex/documents/tests/commands/test_reindex.py b/arkindex/documents/tests/commands/test_reindex.py
index 9446a4dcb3..a3ce57de6e 100644
--- a/arkindex/documents/tests/commands/test_reindex.py
+++ b/arkindex/documents/tests/commands/test_reindex.py
@@ -251,6 +251,7 @@ class TestReindexCommand(FixtureTestCase):
                 "parent_type": self.page.type.display_name,
                 "classification_id": str(cl_1.id),
                 "classification_name": cl_1.ml_class.name,
+                "classification_text": cl_1.ml_class.name,
                 "classification_confidence": cl_1.confidence,
                 "classification_worker": self.worker.name,
             },
@@ -265,6 +266,7 @@ class TestReindexCommand(FixtureTestCase):
                 "parent_type": self.page.type.display_name,
                 "classification_id": str(cl_2.id),
                 "classification_name": cl_2.ml_class.name,
+                "classification_text": cl_2.ml_class.name,
                 "classification_confidence": cl_2.confidence,
                 "classification_worker": self.worker.name,
             }
diff --git a/arkindex/documents/tests/test_indexer.py b/arkindex/documents/tests/test_indexer.py
index 3a25eadf5a..8fd06cb9e4 100644
--- a/arkindex/documents/tests/test_indexer.py
+++ b/arkindex/documents/tests/test_indexer.py
@@ -181,6 +181,7 @@ class TestIndexerCommand(FixtureTestCase):
                 "id": str(indexer.build_solr_id(self.page, cl_1)),
                 "classification_id": str(cl_1.id),
                 "classification_name": cl_1.ml_class.name,
+                "classification_text": cl_1.ml_class.name,
                 "classification_confidence": cl_1.confidence,
                 "classification_worker": self.worker.name,
             },
@@ -189,6 +190,7 @@ class TestIndexerCommand(FixtureTestCase):
                 "id": str(indexer.build_solr_id(self.page, cl_2)),
                 "classification_id": str(cl_2.id),
                 "classification_name": cl_2.ml_class.name,
+                "classification_text": cl_2.ml_class.name,
                 "classification_confidence": cl_2.confidence,
                 "classification_worker": self.worker.name,
             }
diff --git a/arkindex/documents/tests/test_search_api.py b/arkindex/documents/tests/test_search_api.py
index 9aacaf4d34..cd08ee1708 100644
--- a/arkindex/documents/tests/test_search_api.py
+++ b/arkindex/documents/tests/test_search_api.py
@@ -121,11 +121,8 @@ class TestSearchApi(FixtureAPITestCase):
     @override_settings(ARKINDEX_FEATURES={"search": True})
     @patch("arkindex.documents.api.search.solr")
     def test_search(self, mock_solr):
+        self.maxDiff = None
         collection_name = f"project-{self.corpus.id}"
-        possible_queries = [
-            '(element_text:("I search" OR "Found") OR transcription_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")',
-            '(transcription_text:("I search" OR "Found") OR element_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")'
-        ]
         docs = [{
             "id": "document_id",
             "parent_id": "parent_id",
@@ -142,6 +139,7 @@ class TestSearchApi(FixtureAPITestCase):
             "transcription_worker": "1234567890_A worker",
             "classification_id": "classification_id",
             "classification_name": "my class",
+            "classification_text": "my class",
             "classification_confidence": 0.1,
             "classification_worker": "1234567890_A worker",
             "metadata_id": "metadata_id",
@@ -157,11 +155,12 @@ class TestSearchApi(FixtureAPITestCase):
 
         # Mock SolrClient
         mock_solr.collections.exists.return_value = True
-        solr_response = self.build_solr_response(docs=docs, query=possible_queries[0])
+        query = '(element_text:("I search" OR "Found") OR transcription_text:("I search" OR "Found") OR classification_text:("I search" OR "Found")) AND (metadata_name:"folio" AND entity_type:"person")'
+        solr_response = self.build_solr_response(docs=docs, query=query)
         mock_solr.query.return_value = solr_response
 
         payload = {
-            "sources[]": ["element", "transcription"],
+            "sources[]": ["element", "transcription", "classification"],
             "metadata_name": "folio",
             "entity_type": "person",
             "query": '"I search" OR "Found"',
@@ -182,7 +181,11 @@ class TestSearchApi(FixtureAPITestCase):
         self.assertEqual(index_name, collection_name)
         (index_name, args), kwargs = mock_solr.query.call_args
         self.assertEqual(index_name, collection_name)
-        self.assertIn(args.pop("q"), possible_queries)
+        # The order of the sources within the query is not deterministic, so only check that each requested source's clause is present
+        q = args.pop("q")
+        self.assertIn('element_text:("I search" OR "Found")', q)
+        self.assertIn('transcription_text:("I search" OR "Found")', q)
+        self.assertIn('classification_text:("I search" OR "Found")', q)
         self.assertDictEqual(args, {
             "start": 0,
             "rows": 20,
-- 
GitLab