diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py index 8a4def011e469acbbbfb42543042499f051e2562..80fc3de1daba7f2441f4ee4056885c71316c64a5 100644 --- a/dan/datasets/extract/__init__.py +++ b/dan/datasets/extract/__init__.py @@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None: help="Do not remove beginning, ending and consecutive spaces in transcriptions.", ) + parser.add_argument( + "--allow-empty", + action="store_true", + help="Also extract data from element with no transcription.", + ) + parser.set_defaults(func=run) diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index e6847e0c4b3a34996846a7fcd2e1bba0d94edb9f..0251e6545f7068e294886e8b2f38cf39b33e7a99 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -74,6 +74,7 @@ class ArkindexExtractor: max_height: Optional[int] = None, keep_spaces: bool = False, image_extension: str = "", + allow_empty: bool = False, ) -> None: self.folders = folders self.element_type = element_type @@ -87,7 +88,7 @@ class ArkindexExtractor: self.max_width = max_width self.max_height = max_height self.image_extension = image_extension - + self.allow_empty = allow_empty self.keep_spaces = keep_spaces self.data: Dict = defaultdict(dict) @@ -196,6 +197,8 @@ class ArkindexExtractor: element.id, self.transcription_worker_version ) if len(transcriptions) == 0: + if self.allow_empty: + return "" raise NoTranscriptionError(element.id) transcription = random.choice(transcriptions) @@ -425,6 +428,7 @@ def run( max_height: Optional[int], image_format: str, keep_spaces: bool, + allow_empty: bool, ): assert database.exists(), f"No file found @ {database}" open_database(path=database) @@ -449,4 +453,5 @@ def run( max_height=max_height, keep_spaces=keep_spaces, image_extension=image_format, + allow_empty=allow_empty, ).run() diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md index 
a7715d59e07d1e776b05ff72a61b8d034e88dc4f..5172933059709b9eeab189893fecd1b52fc5a9dd 100644 --- a/docs/usage/datasets/extract.md +++ b/docs/usage/datasets/extract.md @@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip | `--max-height` | Images larger than this height will be resized to this height. | `int` | | | `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False | | `--image-format` | Images will be saved under this format. | `str` | `.jpg` | +| `--allow-empty` | Elements with no transcription are skipped by default. Use this flag to extract them with an empty transcription instead. | `bool` | False | The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. diff --git a/tests/test_extract.py b/tests/test_extract.py index 529810005df89fc9390fb2e84045327e99d90b90..d23c2c733e9ad9673b8e081a873601cbaee060be 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -12,7 +12,11 @@ import pytest from PIL import Image, ImageChops from arkindex_export import Element, Transcription -from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText +from dan.datasets.extract.exceptions import ( + NoEndTokenError, + NoTranscriptionError, + UnknownTokenInText, +) from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces from dan.utils import parse_tokens @@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys): # Check stdout captured = capsys.readouterr() assert captured.out == "deadbeef: Image URL must be HTTP(S)\n" + + +@pytest.mark.parametrize("allow_empty", (True, False)) +def test_empty_transcription(allow_empty, mock_database): + extractor = ArkindexExtractor( + folders=["train", 
"val", "test"], + element_type=["text_line"], + parent_element_type="double_page", + output=None, + entity_separators=None, + tokens=None, + transcription_worker_version=None, + entity_worker_version=None, + keep_spaces=False, + image_extension=".jpg", + allow_empty=allow_empty, + ) + element_no_transcription = Element(id="unknown") + if allow_empty: + assert extractor.extract_transcription(element_no_transcription) == "" + else: + with pytest.raises(NoTranscriptionError): + extractor.extract_transcription(element_no_transcription)