From e7051d584f81a32a63861d5fc9be3488e2e4101f Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Tue, 10 Oct 2023 10:16:37 +0000
Subject: [PATCH] Allow element with no transcription

---
 dan/datasets/extract/__init__.py |  6 ++++++
 dan/datasets/extract/extract.py  |  7 ++++++-
 docs/usage/datasets/extract.md   |  1 +
 tests/test_extract.py            | 29 ++++++++++++++++++++++++++++-
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 8a4def01..80fc3de1 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
         help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
     )
 
+    parser.add_argument(
+        "--allow-empty",
+        action="store_true",
+        help="Also extract data from element with no transcription.",
+    )
+
     parser.set_defaults(func=run)
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index e6847e0c..0251e654 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -74,6 +74,7 @@ class ArkindexExtractor:
         max_height: Optional[int] = None,
         keep_spaces: bool = False,
         image_extension: str = "",
+        allow_empty: bool = False,
     ) -> None:
         self.folders = folders
         self.element_type = element_type
@@ -87,7 +88,7 @@ class ArkindexExtractor:
         self.max_width = max_width
         self.max_height = max_height
         self.image_extension = image_extension
-
+        self.allow_empty = allow_empty
         self.keep_spaces = keep_spaces
 
         self.data: Dict = defaultdict(dict)
@@ -196,6 +197,8 @@ class ArkindexExtractor:
             element.id, self.transcription_worker_version
         )
         if len(transcriptions) == 0:
+            if self.allow_empty:
+                return ""
             raise NoTranscriptionError(element.id)
 
         transcription = random.choice(transcriptions)
@@ -425,6 +428,7 @@ def run(
     max_height: Optional[int],
     image_format: str,
     keep_spaces: bool,
+    allow_empty: bool,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)
@@ -449,4 +453,5 @@ def run(
         max_height=max_height,
         keep_spaces=keep_spaces,
         image_extension=image_format,
+        allow_empty=allow_empty,
     ).run()
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index a7715d59..51729330 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
 | `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                                    |
 | `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                              |
 | `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                                             |
+| `--allow-empty`                  | Elements with no transcriptions are skipped by default. This flag disables this behaviour.                                                                                                                                           | `bool`          | False                                              |
 
 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
 
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 52981000..d23c2c73 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -12,7 +12,11 @@ import pytest
 from PIL import Image, ImageChops
 
 from arkindex_export import Element, Transcription
-from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
+from dan.datasets.extract.exceptions import (
+    NoEndTokenError,
+    NoTranscriptionError,
+    UnknownTokenInText,
+)
 from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
 from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
 from dan.utils import parse_tokens
@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
     # Check stdout
     captured = capsys.readouterr()
     assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
+
+
+@pytest.mark.parametrize("allow_empty", (True, False))
+def test_empty_transcription(allow_empty, mock_database):
+    extractor = ArkindexExtractor(
+        folders=["train", "val", "test"],
+        element_type=["text_line"],
+        parent_element_type="double_page",
+        output=None,
+        entity_separators=None,
+        tokens=None,
+        transcription_worker_version=None,
+        entity_worker_version=None,
+        keep_spaces=False,
+        image_extension=".jpg",
+        allow_empty=allow_empty,
+    )
+    element_no_transcription = Element(id="unknown")
+    if allow_empty:
+        assert extractor.extract_transcription(element_no_transcription) == ""
+    else:
+        with pytest.raises(NoTranscriptionError):
+            extractor.extract_transcription(element_no_transcription)
-- 
GitLab