Return the stripped transcription text when the transcription has no entities

ArkindexExtractor.extract_transcription now computes the stripped
transcription text once and returns it in two cases: when no tokens are
configured, and when get_transcription_entities() yields no entities
(entities.count() is zero). Otherwise the text is still passed through
entities_to_xml() and self.translate() as before.

A parametrized test (tokens=None and tokens=tokens.yml) checks that the plain
text "Coupez Bouis 7.12.14" is returned in both cases.

NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy; confirm the context-line indentation against
the target files before applying.

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index d8dcc1f6e13a8fb8414244a010c90fbf64db9830..12dff600a54c7b8768c973b8d759bae515a36911 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -106,9 +106,10 @@ class ArkindexExtractor:
             raise NoTranscriptionError(element.id)
 
         transcription = random.choice(transcriptions)
+        stripped_text = transcription.text.strip()
 
         if not self.tokens:
-            return transcription.text.strip()
+            return stripped_text
 
         entities = get_transcription_entities(
             transcription.id,
@@ -116,6 +117,9 @@ class ArkindexExtractor:
             supported_types=list(self.tokens),
         )
 
+        if not entities.count():
+            return stripped_text
+
         return self.translate(
             entities_to_xml(
                 transcription.text, entities, entity_separators=self.entity_separators
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 29132e863278ec7aaf3dc9a43cff96e4c7e7a8b0..1bf92c3ac1e318a51a15e421408e74d5db83e3eb 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -425,6 +425,32 @@ def test_empty_transcription(allow_empty, mock_database):
             extractor.extract_transcription(element_no_transcription)
 
 
+@pytest.mark.parametrize("tokens", (None, EXTRACTION_DATA_PATH / "tokens.yml"))
+def test_extract_transcription_no_translation(mock_database, tokens):
+    extractor = ArkindexExtractor(
+        element_type=["text_line"],
+        entity_separators=None,
+        tokens=tokens,
+    )
+
+    element = Element.get_by_id("test-page_1-line_1")
+    # Deleting one of the two transcriptions from the element
+    Transcription.get(
+        Transcription.element == element,
+        Transcription.worker_version_id == "worker_version_id",
+    ).delete_instance(recursive=True)
+
+    # Deleting all entities on the element's remaining transcription while leaving the transcription intact
+    if tokens:
+        TranscriptionEntity.delete().where(
+            TranscriptionEntity.transcription
+            == Transcription.select().where(Transcription.element == element).get()
+        ).execute()
+
+    # Early return with only the element's transcription text instead of a translation
+    assert extractor.extract_transcription(element) == "Coupez Bouis 7.12.14"
+
+
 @pytest.mark.parametrize(
     "nestation, xml_output, separators",
     (