Skip to content
Snippets Groups Projects
Unverified Commit 126fc3e0 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Allow no transcription unit test

parent 56f084d6
No related branches found
No related tags found
1 merge request: !285 "Allow element with no transcription"
This commit is part of merge request !285. Comments created here will be created in the context of that merge request.
......@@ -28,7 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--allow-empty` | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False |
| `--allow-empty` | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False |
The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing an NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens of that entity, respectively.
......
......@@ -12,7 +12,11 @@ import pytest
from PIL import Image, ImageChops
from arkindex_export import Element, Transcription
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
UnknownTokenInText,
)
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens
......@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
# Check stdout
captured = capsys.readouterr()
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
    """Check how ``extract_transcription`` reacts to ``allow_empty``.

    With ``allow_empty=True`` the extractor must return an empty string for
    the probed element; with ``allow_empty=False`` it must raise
    ``NoTranscriptionError``.

    The ``mock_database`` fixture is requested but never referenced in the
    body (presumably it sets up the database the extractor queries --
    confirm against the fixture definition).
    """
    extractor_kwargs = {
        "folders": ["train", "val", "test"],
        "element_type": ["text_line"],
        "parent_element_type": "double_page",
        "output": None,
        "entity_separators": None,
        "tokens": None,
        "transcription_worker_version": None,
        "entity_worker_version": None,
        "keep_spaces": False,
        "image_extension": ".jpg",
        "allow_empty": allow_empty,
    }
    extractor = ArkindexExtractor(**extractor_kwargs)

    # Element built with an arbitrary id; the test expects it to have no
    # transcription attached.
    orphan_element = Element(id="unknown")

    if not allow_empty:
        # Strict mode: a missing transcription is reported as an error.
        with pytest.raises(NoTranscriptionError):
            extractor.extract_transcription(orphan_element)
        return

    # Permissive mode: a missing transcription yields an empty string.
    assert extractor.extract_transcription(orphan_element) == ""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment