Allow elements with no transcription

e7051d58 · Yoann Schneider · Manon Blanco · 47ec8226 · e7051d58 · e7051d58
Commit e7051d58, authored 1 year ago by Yoann Schneider; committed by Manon Blanco 1 year ago.
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
        help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
    )

+    parser.add_argument(
+        "--allow-empty",
+        action="store_true",
+        help="Also extract data from element with no transcription.",
+    )
+
    parser.set_defaults(func=run)
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -74,6 +74,7 @@ class ArkindexExtractor:
        max_height: Optional[int] = None,
        keep_spaces: bool = False,
        image_extension: str = "",
+        allow_empty: bool = False,
    ) -> None:
        self.folders = folders
        self.element_type = element_type
@@ -87,7 +88,7 @@ class ArkindexExtractor:
        self.max_width = max_width
        self.max_height = max_height
        self.image_extension = image_extension
-
+        self.allow_empty = allow_empty
        self.keep_spaces = keep_spaces

        self.data: Dict = defaultdict(dict)
@@ -196,6 +197,8 @@ class ArkindexExtractor:
            element.id, self.transcription_worker_version
        )
        if len(transcriptions) == 0:
+            if self.allow_empty:
+                return ""
            raise NoTranscriptionError(element.id)

        transcription = random.choice(transcriptions)
@@ -425,6 +428,7 @@ def run(
    max_height: Optional[int],
    image_format: str,
    keep_spaces: bool,
+    allow_empty: bool,
 ):
    assert database.exists(), f"No file found @ {database}"
    open_database(path=database)
@@ -449,4 +453,5 @@ def run(
        max_height=max_height,
        keep_spaces=keep_spaces,
        image_extension=image_format,
+        allow_empty=allow_empty,
    ).run()
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
 | `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                                    |
 | `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                              |
 | `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                                             |
+| `--allow-empty`                  | Elements with no transcriptions are skipped by default. This flag disables this behaviour.                                                                                                                                           | `bool`          | False                                              |

 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.


--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -12,7 +12,11 @@ import pytest
 from PIL import Image, ImageChops

 from arkindex_export import Element, Transcription
-from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
+from dan.datasets.extract.exceptions import (
+    NoEndTokenError,
+    NoTranscriptionError,
+    UnknownTokenInText,
+)
 from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
 from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
 from dan.utils import parse_tokens
@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
    # Check stdout
    captured = capsys.readouterr()
    assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
+
+
+@pytest.mark.parametrize("allow_empty", (True, False))
+def test_empty_transcription(allow_empty, mock_database):
+    extractor = ArkindexExtractor(
+        folders=["train", "val", "test"],
+        element_type=["text_line"],
+        parent_element_type="double_page",
+        output=None,
+        entity_separators=None,
+        tokens=None,
+        transcription_worker_version=None,
+        entity_worker_version=None,
+        keep_spaces=False,
+        image_extension=".jpg",
+        allow_empty=allow_empty,
+    )
+    element_no_transcription = Element(id="unknown")
+    if allow_empty:
+        assert extractor.extract_transcription(element_no_transcription) == ""
+    else:
+        with pytest.raises(NoTranscriptionError):
+            extractor.extract_transcription(element_no_transcription)