Skip to content
Snippets Groups Projects
Commit e7051d58 authored by Yoann Schneider's avatar Yoann Schneider :tennis: Committed by Manon Blanco
Browse files

Allow element with no transcription

parent 47ec8226
No related branches found
No related tags found
1 merge request: !285 "Allow element with no transcription"
......@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
)
parser.add_argument(
"--allow-empty",
action="store_true",
help="Also extract data from element with no transcription.",
)
parser.set_defaults(func=run)
......@@ -74,6 +74,7 @@ class ArkindexExtractor:
max_height: Optional[int] = None,
keep_spaces: bool = False,
image_extension: str = "",
allow_empty: bool = False,
) -> None:
self.folders = folders
self.element_type = element_type
......@@ -87,7 +88,7 @@ class ArkindexExtractor:
self.max_width = max_width
self.max_height = max_height
self.image_extension = image_extension
self.allow_empty = allow_empty
self.keep_spaces = keep_spaces
self.data: Dict = defaultdict(dict)
......@@ -196,6 +197,8 @@ class ArkindexExtractor:
element.id, self.transcription_worker_version
)
if len(transcriptions) == 0:
if self.allow_empty:
return ""
raise NoTranscriptionError(element.id)
transcription = random.choice(transcriptions)
......@@ -425,6 +428,7 @@ def run(
max_height: Optional[int],
image_format: str,
keep_spaces: bool,
allow_empty: bool,
):
assert database.exists(), f"No file found @ {database}"
open_database(path=database)
......@@ -449,4 +453,5 @@ def run(
max_height=max_height,
keep_spaces=keep_spaces,
image_extension=image_format,
allow_empty=allow_empty,
).run()
......@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--allow-empty` | Elements with no transcription are skipped by default. Use this flag to extract them as well. | `bool` | False |
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
......
......@@ -12,7 +12,11 @@ import pytest
from PIL import Image, ImageChops
from arkindex_export import Element, Transcription
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
UnknownTokenInText,
)
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens
......@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
# Check stdout
captured = capsys.readouterr()
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
    """Check how ``extract_transcription`` treats an element without transcription.

    With ``allow_empty`` enabled the extractor is expected to return an empty
    string; otherwise it must raise ``NoTranscriptionError``.

    NOTE(review): relies on the ``mock_database`` fixture not holding any
    transcription for the element id "unknown" -- confirm against the fixture.
    """
    extractor = ArkindexExtractor(
        folders=["train", "val", "test"],
        element_type=["text_line"],
        parent_element_type="double_page",
        output=None,
        entity_separators=None,
        tokens=None,
        transcription_worker_version=None,
        entity_worker_version=None,
        keep_spaces=False,
        image_extension=".jpg",
        allow_empty=allow_empty,
    )
    orphan_element = Element(id="unknown")

    # Strict mode: the missing transcription must surface as an error.
    if not allow_empty:
        with pytest.raises(NoTranscriptionError):
            extractor.extract_transcription(orphan_element)
        return

    # Lenient mode: the missing transcription is reported as an empty string.
    assert extractor.extract_transcription(orphan_element) == ""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment