Merge branch 'extract-page-without-entities' into 'main'

Allow extracting elements holding transcriptions without entities. Closes #254. See merge request !348.

Merge branch 'extract-page-without-entities' into 'main'
6be3dcb0 · Yoann Schneider · 2e719c9e · 4f880822 · 6be3dcb0 · 6be3dcb0
Commit 6be3dcb0 authored 1 year ago by Yoann Schneider
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -106,9 +106,10 @@ class ArkindexExtractor:
            raise NoTranscriptionError(element.id)
        transcription = random.choice(transcriptions)
+        stripped_text = transcription.text.strip()
        if not self.tokens:
-            return transcription.text.strip()
+            return stripped_text
        entities = get_transcription_entities(
            transcription.id,
@@ -116,6 +117,9 @@ class ArkindexExtractor:
            supported_types=list(self.tokens),
        )
+        if not entities.count():
+            return stripped_text
        return self.translate(
            entities_to_xml(
                transcription.text, entities, entity_separators=self.entity_separators

--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -425,6 +425,32 @@ def test_empty_transcription(allow_empty, mock_database):
            extractor.extract_transcription(element_no_transcription)
+@pytest.mark.parametrize("tokens", (None, EXTRACTION_DATA_PATH / "tokens.yml"))
+def test_extract_transcription_no_translation(mock_database, tokens):
+    extractor = ArkindexExtractor(
+        element_type=["text_line"],
+        entity_separators=None,
+        tokens=tokens,
+    )
+    element = Element.get_by_id("test-page_1-line_1")
+    # Delete one of the element's two transcriptions so that only a single candidate remains
+    Transcription.get(
+        Transcription.element == element,
+        Transcription.worker_version_id == "worker_version_id",
+    ).delete_instance(recursive=True)
+    # When tokens are set, delete every entity on the element's remaining transcription while leaving the transcription itself intact
+    if tokens:
+        TranscriptionEntity.delete().where(
+            TranscriptionEntity.transcription
+            == Transcription.select().where(Transcription.element == element).get()
+        ).execute()
+    # Expect the early return: the stripped transcription text as-is, not a translation
+    assert extractor.extract_transcription(element) == "Coupez  Bouis  7.12.14"
 @pytest.mark.parametrize(
    "nestation, xml_output, separators",
    (