Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision were being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (3)
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging import logging
from collections import Counter, defaultdict from collections import Counter, defaultdict
from functools import partial
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional
...@@ -32,23 +33,32 @@ def create_table( ...@@ -32,23 +33,32 @@ def create_table(
operations = [] operations = []
if count: if count:
operations.append(("Count", len)) operations.append(("Count", len, None))
operations.extend( operations.extend(
[ [
("Min", np.min), ("Min", np.min, None),
("Max", np.max), ("Max", np.max, None),
("Mean", np.mean), ("Mean", np.mean, 2),
("Median", np.median), ("Median", np.median, 2),
] ]
) )
if total: if total:
operations.append(("Total", np.sum)) operations.append(("Total", np.sum, None))
statistics.add_rows( statistics.add_rows(
[ [
[col_name, *list(map(operator, data.values()))] [
for col_name, operator in operations col_name,
*list(
map(
# Round values if needed
partial(round, ndigits=digits),
map(operator, data.values()),
)
),
]
for col_name, operator, digits in operations
] ]
) )
......
...@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None: ...@@ -161,4 +161,10 @@ def add_extract_parser(subcommands) -> None:
help="Do not remove beginning, ending and consecutive spaces in transcriptions.", help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
) )
parser.add_argument(
"--allow-empty",
action="store_true",
help="Also extract data from element with no transcription.",
)
parser.set_defaults(func=run) parser.set_defaults(func=run)
...@@ -74,6 +74,7 @@ class ArkindexExtractor: ...@@ -74,6 +74,7 @@ class ArkindexExtractor:
max_height: Optional[int] = None, max_height: Optional[int] = None,
keep_spaces: bool = False, keep_spaces: bool = False,
image_extension: str = "", image_extension: str = "",
allow_empty: bool = False,
) -> None: ) -> None:
self.folders = folders self.folders = folders
self.element_type = element_type self.element_type = element_type
...@@ -87,7 +88,7 @@ class ArkindexExtractor: ...@@ -87,7 +88,7 @@ class ArkindexExtractor:
self.max_width = max_width self.max_width = max_width
self.max_height = max_height self.max_height = max_height
self.image_extension = image_extension self.image_extension = image_extension
self.allow_empty = allow_empty
self.keep_spaces = keep_spaces self.keep_spaces = keep_spaces
self.data: Dict = defaultdict(dict) self.data: Dict = defaultdict(dict)
...@@ -196,6 +197,8 @@ class ArkindexExtractor: ...@@ -196,6 +197,8 @@ class ArkindexExtractor:
element.id, self.transcription_worker_version element.id, self.transcription_worker_version
) )
if len(transcriptions) == 0: if len(transcriptions) == 0:
if self.allow_empty:
return ""
raise NoTranscriptionError(element.id) raise NoTranscriptionError(element.id)
transcription = random.choice(transcriptions) transcription = random.choice(transcriptions)
...@@ -425,6 +428,7 @@ def run( ...@@ -425,6 +428,7 @@ def run(
max_height: Optional[int], max_height: Optional[int],
image_format: str, image_format: str,
keep_spaces: bool, keep_spaces: bool,
allow_empty: bool,
): ):
assert database.exists(), f"No file found @ {database}" assert database.exists(), f"No file found @ {database}"
open_database(path=database) open_database(path=database)
...@@ -449,4 +453,5 @@ def run( ...@@ -449,4 +453,5 @@ def run(
max_height=max_height, max_height=max_height,
keep_spaces=keep_spaces, keep_spaces=keep_spaces,
image_extension=image_format, image_extension=image_format,
allow_empty=allow_empty,
).run() ).run()
# Get started # Get started
To use DAN in your own environment, you need to first clone with its submodules via: To use DAN in your own environment, you need to first clone via:
```shell ```shell
git clone --recurse-submodules git@gitlab.teklia.com:atr/dan.git git clone git@gitlab.teklia.com:atr/dan.git
```
If you forgot the `--recurse-submodules`, you can initialize the submodule using:
```shell
git submodule update --init
``` ```
Then you can install it via pip: Then you can install it via pip:
......
...@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip ...@@ -28,6 +28,7 @@ If an image download fails for whatever reason, it won't appear in the transcrip
| `--max-height` | Images larger than this height will be resized to this height. | `int` | | | `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False | | `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` | | `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--allow-empty` | Elements with no transcriptions are skipped by default. This flag disables this behaviour. | `bool` | False |
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
......
...@@ -12,7 +12,11 @@ import pytest ...@@ -12,7 +12,11 @@ import pytest
from PIL import Image, ImageChops from PIL import Image, ImageChops
from arkindex_export import Element, Transcription from arkindex_export import Element, Transcription
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownTokenInText from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
UnknownTokenInText,
)
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens from dan.utils import parse_tokens
...@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys): ...@@ -487,3 +491,26 @@ def test_download_image_error(iiif_url, caplog, capsys):
# Check stdout # Check stdout
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n" assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
    """Extracting an element with no transcription yields "" when `allow_empty`
    is set, and raises `NoTranscriptionError` otherwise.
    """
    arkindex_extractor = ArkindexExtractor(
        folders=["train", "val", "test"],
        element_type=["text_line"],
        parent_element_type="double_page",
        output=None,
        entity_separators=None,
        tokens=None,
        transcription_worker_version=None,
        entity_worker_version=None,
        keep_spaces=False,
        image_extension=".jpg",
        allow_empty=allow_empty,
    )
    # "unknown" does not exist in the mock database, so no transcription is found.
    empty_element = Element(id="unknown")
    if not allow_empty:
        with pytest.raises(NoTranscriptionError):
            arkindex_extractor.extract_transcription(empty_element)
    else:
        assert arkindex_extractor.extract_transcription(empty_element) == ""