From e7051d584f81a32a63861d5fc9be3488e2e4101f Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Tue, 10 Oct 2023 10:16:37 +0000
Subject: [PATCH] Allow element with no transcription

---
 dan/datasets/extract/__init__.py |  6 ++++++
 dan/datasets/extract/extract.py  |  7 ++++++-
 docs/usage/datasets/extract.md   |  1 +
 tests/test_extract.py            | 29 ++++++++++++++++++++++++++++-
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 8a4def01..80fc3de1 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
         help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
     )
 
+    parser.add_argument(
+        "--allow-empty",
+        action="store_true",
+        help="Also extract data from element with no transcription.",
+    )
+
     parser.set_defaults(func=run)
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index e6847e0c..0251e654 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -74,6 +74,7 @@ class ArkindexExtractor:
         max_height: Optional[int] = None,
         keep_spaces: bool = False,
         image_extension: str = "",
+        allow_empty: bool = False,
     ) -> None:
         self.folders = folders
         self.element_type = element_type
@@ -87,7 +88,7 @@ class ArkindexExtractor:
         self.max_width = max_width
         self.max_height = max_height
         self.image_extension = image_extension
-
+        self.allow_empty = allow_empty
         self.keep_spaces = keep_spaces
         self.data: Dict = defaultdict(dict)
 
@@ -196,6 +197,8 @@ class ArkindexExtractor:
             element.id, self.transcription_worker_version
         )
         if len(transcriptions) == 0:
+            if self.allow_empty:
+                return ""
             raise NoTranscriptionError(element.id)
 
         transcription = random.choice(transcriptions)
@@ -425,6 +428,7 @@ def run(
     max_height: Optional[int],
     image_format: str,
     keep_spaces: bool,
+    allow_empty: bool,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)
@@ -449,4 +453,5 @@ def run(
         max_height=max_height,
         keep_spaces=keep_spaces,
         image_extension=image_format,
+        allow_empty=allow_empty,
     ).run()
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index a7715d59..51729330 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
 | `--max-height`   | Images larger than this height will be resized to this height.                             | `int`  |        |
 | `--keep-spaces`  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.            | `bool` | False  |
 | `--image-format` | Images will be saved under this format.                                                    | `str`  | `.jpg` |
+| `--allow-empty`  | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False  |
 
 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
 
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 52981000..d23c2c73 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -12,7 +12,11 @@ import pytest
 from PIL import Image, ImageChops
 from arkindex_export import Element, Transcription
 
-from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
+from dan.datasets.extract.exceptions import (
+    NoEndTokenError,
+    NoTranscriptionError,
+    UnknownTokenInText,
+)
 from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
 from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
 from dan.utils import parse_tokens
@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
     # Check stdout
     captured = capsys.readouterr()
     assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
+
+
+@pytest.mark.parametrize("allow_empty", (True, False))
+def test_empty_transcription(allow_empty, mock_database):
+    extractor = ArkindexExtractor(
+        folders=["train", "val", "test"],
+        element_type=["text_line"],
+        parent_element_type="double_page",
+        output=None,
+        entity_separators=None,
+        tokens=None,
+        transcription_worker_version=None,
+        entity_worker_version=None,
+        keep_spaces=False,
+        image_extension=".jpg",
+        allow_empty=allow_empty,
+    )
+    element_no_transcription = Element(id="unknown")
+    if allow_empty:
+        assert extractor.extract_transcription(element_no_transcription) == ""
+    else:
+        with pytest.raises(NoTranscriptionError):
+            extractor.extract_transcription(element_no_transcription)
-- 
GitLab
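For reference, the behaviour this patch introduces can be exercised outside the test suite roughly as follows. This is a minimal sketch mirroring the new `test_empty_transcription` test, not part of the patch itself: the constructor arguments are copied from that test, and it assumes an Arkindex export database has already been opened (the test relies on the `mock_database` fixture for this).

```python
from arkindex_export import Element

from dan.datasets.extract.exceptions import NoTranscriptionError
from dan.datasets.extract.extract import ArkindexExtractor


def build_extractor(allow_empty: bool) -> ArkindexExtractor:
    # Arguments copied from the new test; adapt them to a real setup.
    return ArkindexExtractor(
        folders=["train", "val", "test"],
        element_type=["text_line"],
        parent_element_type="double_page",
        output=None,
        entity_separators=None,
        tokens=None,
        transcription_worker_version=None,
        entity_worker_version=None,
        keep_spaces=False,
        image_extension=".jpg",
        allow_empty=allow_empty,
    )


# An element that has no transcription in the export database.
element = Element(id="unknown")

# New behaviour: with --allow-empty, the element is kept with an empty transcription.
assert build_extractor(allow_empty=True).extract_transcription(element) == ""

# Default behaviour is unchanged: a missing transcription raises NoTranscriptionError.
try:
    build_extractor(allow_empty=False).extract_transcription(element)
except NoTranscriptionError:
    pass
```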