Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (3)
# -*- coding: utf-8 -*-
import logging
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path
from typing import Dict, List, Optional
......@@ -32,23 +33,32 @@ def create_table(
operations = []
if count:
operations.append(("Count", len))
operations.append(("Count", len, None))
operations.extend(
[
("Min", np.min),
("Max", np.max),
("Mean", np.mean),
("Median", np.median),
("Min", np.min, None),
("Max", np.max, None),
("Mean", np.mean, 2),
("Median", np.median, 2),
]
)
if total:
operations.append(("Total", np.sum))
operations.append(("Total", np.sum, None))
statistics.add_rows(
[
[col_name, *list(map(operator, data.values()))]
for col_name, operator in operations
[
col_name,
*list(
map(
# Round values if needed
partial(round, ndigits=digits),
map(operator, data.values()),
)
),
]
for col_name, operator, digits in operations
]
)
......
......@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
)
parser.add_argument(
"--allow-empty",
action="store_true",
help="Also extract data from element with no transcription.",
)
parser.set_defaults(func=run)
......@@ -74,6 +74,7 @@ class ArkindexExtractor:
max_height: Optional[int] = None,
keep_spaces: bool = False,
image_extension: str = "",
allow_empty: bool = False,
) -> None:
self.folders = folders
self.element_type = element_type
......@@ -87,7 +88,7 @@ class ArkindexExtractor:
self.max_width = max_width
self.max_height = max_height
self.image_extension = image_extension
self.allow_empty = allow_empty
self.keep_spaces = keep_spaces
self.data: Dict = defaultdict(dict)
......@@ -196,6 +197,8 @@ class ArkindexExtractor:
element.id, self.transcription_worker_version
)
if len(transcriptions) == 0:
if self.allow_empty:
return ""
raise NoTranscriptionError(element.id)
transcription = random.choice(transcriptions)
......@@ -425,6 +428,7 @@ def run(
max_height: Optional[int],
image_format: str,
keep_spaces: bool,
allow_empty: bool,
):
assert database.exists(), f"No file found @ {database}"
open_database(path=database)
......@@ -449,4 +453,5 @@ def run(
max_height=max_height,
keep_spaces=keep_spaces,
image_extension=image_format,
allow_empty=allow_empty,
).run()
# Get started
To use DAN in your own environment, you need to first clone with its submodules via:
To use DAN in your own environment, you need to first clone via:
```shell
git clone --recurse-submodules git@gitlab.teklia.com:atr/dan.git
```
If you forgot the `--recurse-submodules`, you can initialize the submodule using:
```shell
git submodule update --init
git clone git@gitlab.teklia.com:atr/dan.git
```
Then you can install it via pip:
......
......@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--allow-empty` | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False |
The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
......
......@@ -12,7 +12,11 @@ import pytest
from PIL import Image, ImageChops
from arkindex_export import Element, Transcription
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText
from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
UnknownTokenInText,
)
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens
......@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
# Check stdout
captured = capsys.readouterr()
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
    """An element with no transcription is extracted as an empty string when
    `allow_empty` is set, and raises `NoTranscriptionError` otherwise.
    """
    # Build an extractor with every optional feature disabled so that only
    # the `allow_empty` flag influences the outcome.
    extractor = ArkindexExtractor(
        folders=["train", "val", "test"],
        element_type=["text_line"],
        parent_element_type="double_page",
        output=None,
        entity_separators=None,
        tokens=None,
        transcription_worker_version=None,
        entity_worker_version=None,
        keep_spaces=False,
        image_extension=".jpg",
        allow_empty=allow_empty,
    )
    # This element exists only in memory, so no transcription is attached.
    orphan = Element(id="unknown")
    if not allow_empty:
        with pytest.raises(NoTranscriptionError):
            extractor.extract_transcription(orphan)
    else:
        assert extractor.extract_transcription(orphan) == ""