From 4f880822e73197236c289c2c294a5f5c742766bd Mon Sep 17 00:00:00 2001 From: Eva Bardou <bardou@teklia.com> Date: Tue, 23 Jan 2024 11:52:33 +0000 Subject: [PATCH] Allow extracting elements holding transcriptions without entities --- dan/datasets/extract/arkindex.py | 6 +++++- tests/test_extract.py | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index d8dcc1f6..12dff600 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -106,9 +106,10 @@ class ArkindexExtractor: raise NoTranscriptionError(element.id) transcription = random.choice(transcriptions) + stripped_text = transcription.text.strip() if not self.tokens: - return transcription.text.strip() + return stripped_text entities = get_transcription_entities( transcription.id, @@ -116,6 +117,9 @@ class ArkindexExtractor: supported_types=list(self.tokens), ) + if not entities.count(): + return stripped_text + return self.translate( entities_to_xml( transcription.text, entities, entity_separators=self.entity_separators diff --git a/tests/test_extract.py b/tests/test_extract.py index 29132e86..1bf92c3a 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -425,6 +425,32 @@ def test_empty_transcription(allow_empty, mock_database): extractor.extract_transcription(element_no_transcription) +@pytest.mark.parametrize("tokens", (None, EXTRACTION_DATA_PATH / "tokens.yml")) +def test_extract_transcription_no_translation(mock_database, tokens): + extractor = ArkindexExtractor( + element_type=["text_line"], + entity_separators=None, + tokens=tokens, + ) + + element = Element.get_by_id("test-page_1-line_1") + # Deleting one of the two transcriptions from the element + Transcription.get( + Transcription.element == element, + Transcription.worker_version_id == "worker_version_id", + ).delete_instance(recursive=True) + + # Deleting all entities on the element's remaining
transcription while leaving the transcription intact + if tokens: + TranscriptionEntity.delete().where( + TranscriptionEntity.transcription + == Transcription.select().where(Transcription.element == element).get() + ).execute() + + # Early return with only the element transcription text instead of a translation + assert extractor.extract_transcription(element) == "Coupez Bouis 7.12.14" + + @pytest.mark.parametrize( "nestation, xml_output, separators", ( -- GitLab