From 4f880822e73197236c289c2c294a5f5c742766bd Mon Sep 17 00:00:00 2001
From: Eva Bardou <bardou@teklia.com>
Date: Tue, 23 Jan 2024 11:52:33 +0000
Subject: [PATCH] Allow extracting elements holding transcriptions without
 entities

---
 dan/datasets/extract/arkindex.py |  6 +++++-
 tests/test_extract.py            | 26 ++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index d8dcc1f6..12dff600 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -106,9 +106,10 @@ class ArkindexExtractor:
             raise NoTranscriptionError(element.id)
 
         transcription = random.choice(transcriptions)
+        stripped_text = transcription.text.strip()
 
         if not self.tokens:
-            return transcription.text.strip()
+            return stripped_text
 
         entities = get_transcription_entities(
             transcription.id,
@@ -116,6 +117,9 @@ class ArkindexExtractor:
             supported_types=list(self.tokens),
         )
 
+        if not entities.count():
+            return stripped_text
+
         return self.translate(
             entities_to_xml(
                 transcription.text, entities, entity_separators=self.entity_separators
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 29132e86..1bf92c3a 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -425,6 +425,32 @@ def test_empty_transcription(allow_empty, mock_database):
             extractor.extract_transcription(element_no_transcription)
 
 
+@pytest.mark.parametrize("tokens", (None, EXTRACTION_DATA_PATH / "tokens.yml"))
+def test_extract_transcription_no_translation(mock_database, tokens):
+    extractor = ArkindexExtractor(
+        element_type=["text_line"],
+        entity_separators=None,
+        tokens=tokens,
+    )
+
+    element = Element.get_by_id("test-page_1-line_1")
+    # Deleting one of the two transcriptions from the element
+    Transcription.get(
+        Transcription.element == element,
+        Transcription.worker_version_id == "worker_version_id",
+    ).delete_instance(recursive=True)
+
+    # Deleting all entities on the element's remaining transcription while leaving the transcription itself intact
+    if tokens:
+        TranscriptionEntity.delete().where(
+            TranscriptionEntity.transcription
+            == Transcription.select().where(Transcription.element == element).get()
+        ).execute()
+
+    # Early return with only the element's stripped transcription text instead of a translation
+    assert extractor.extract_transcription(element) == "Coupez  Bouis  7.12.14"
+
+
 @pytest.mark.parametrize(
     "nestation, xml_output, separators",
     (
-- 
GitLab