Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision were being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (3)
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging import logging
from collections import Counter, defaultdict from collections import Counter, defaultdict
from functools import partial
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional
...@@ -32,23 +33,32 @@ def create_table( ...@@ -32,23 +33,32 @@ def create_table(
operations = [] operations = []
if count: if count:
operations.append(("Count", len)) operations.append(("Count", len, None))
operations.extend( operations.extend(
[ [
("Min", np.min), ("Min", np.min, None),
("Max", np.max), ("Max", np.max, None),
("Mean", np.mean), ("Mean", np.mean, 2),
("Median", np.median), ("Median", np.median, 2),
] ]
) )
if total: if total:
operations.append(("Total", np.sum)) operations.append(("Total", np.sum, None))
statistics.add_rows( statistics.add_rows(
[ [
[col_name, *list(map(operator, data.values()))] [
for col_name, operator in operations col_name,
*list(
map(
# Round values if needed
partial(round, ndigits=digits),
map(operator, data.values()),
)
),
]
for col_name, operator, digits in operations
] ]
) )
......
...@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None: ...@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
help="Do not remove beginning, ending and consecutive spaces in transcriptions.", help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
) )
parser.add_argument(
"--allow-empty",
action="store_true",
help="Also extract data from element with no transcription.",
)
parser.set_defaults(func=run) parser.set_defaults(func=run)
...@@ -74,6 +74,7 @@ class ArkindexExtractor: ...@@ -74,6 +74,7 @@ class ArkindexExtractor:
max_height: Optional[int] = None, max_height: Optional[int] = None,
keep_spaces: bool = False, keep_spaces: bool = False,
image_extension: str = "", image_extension: str = "",
allow_empty: bool = False,
) -> None: ) -> None:
self.folders = folders self.folders = folders
self.element_type = element_type self.element_type = element_type
...@@ -87,7 +88,7 @@ class ArkindexExtractor: ...@@ -87,7 +88,7 @@ class ArkindexExtractor:
self.max_width = max_width self.max_width = max_width
self.max_height = max_height self.max_height = max_height
self.image_extension = image_extension self.image_extension = image_extension
self.allow_empty = allow_empty
self.keep_spaces = keep_spaces self.keep_spaces = keep_spaces
self.data: Dict = defaultdict(dict) self.data: Dict = defaultdict(dict)
...@@ -196,6 +197,8 @@ class ArkindexExtractor: ...@@ -196,6 +197,8 @@ class ArkindexExtractor:
element.id, self.transcription_worker_version element.id, self.transcription_worker_version
) )
if len(transcriptions) == 0: if len(transcriptions) == 0:
if self.allow_empty:
return ""
raise NoTranscriptionError(element.id) raise NoTranscriptionError(element.id)
transcription = random.choice(transcriptions) transcription = random.choice(transcriptions)
...@@ -425,6 +428,7 @@ def run( ...@@ -425,6 +428,7 @@ def run(
max_height: Optional[int], max_height: Optional[int],
image_format: str, image_format: str,
keep_spaces: bool, keep_spaces: bool,
allow_empty: bool,
): ):
assert database.exists(), f"No file found @ {database}" assert database.exists(), f"No file found @ {database}"
open_database(path=database) open_database(path=database)
...@@ -449,4 +453,5 @@ def run( ...@@ -449,4 +453,5 @@ def run(
max_height=max_height, max_height=max_height,
keep_spaces=keep_spaces, keep_spaces=keep_spaces,
image_extension=image_format, image_extension=image_format,
allow_empty=allow_empty,
).run() ).run()
# Get started # Get started
To use DAN in your own environment, you need to first clone with its submodules via: To use DAN in your own environment, you need to first clone via:
```shell ```shell
git clone --recurse-submodules git@gitlab.teklia.com:atr/dan.git git clone git@gitlab.teklia.com:atr/dan.git
```
If you forgot the `--recurse-submodules`, you can initialize the submodule using:
```shell
git submodule update --init
``` ```
Then you can install it via pip: Then you can install it via pip:
......
...@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip ...@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
| `--max-height` | Images larger than this height will be resized to this height. | `int` | | | `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False | | `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` | | `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--allow-empty` | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False |
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
......
...@@ -12,7 +12,11 @@ import pytest ...@@ -12,7 +12,11 @@ import pytest
from PIL import Image, ImageChops from PIL import Image, ImageChops
from arkindex_export import Element, Transcription from arkindex_export import Element, Transcription
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
UnknownTokenInText,
)
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens from dan.utils import parse_tokens
...@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys): ...@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
# Check stdout # Check stdout
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n" assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
    """Extracting an element with no transcription yields "" when `allow_empty`
    is set, and raises `NoTranscriptionError` otherwise.
    """
    arkindex_extractor = ArkindexExtractor(
        folders=["train", "val", "test"],
        element_type=["text_line"],
        parent_element_type="double_page",
        output=None,
        entity_separators=None,
        tokens=None,
        transcription_worker_version=None,
        entity_worker_version=None,
        keep_spaces=False,
        image_extension=".jpg",
        allow_empty=allow_empty,
    )
    # "unknown" does not exist in the mock database, so no transcription is found.
    empty_element = Element(id="unknown")
    if not allow_empty:
        with pytest.raises(NoTranscriptionError):
            arkindex_extractor.extract_transcription(empty_element)
    else:
        assert arkindex_extractor.extract_transcription(empty_element) == ""