Commit 80017b86 authored by Yoann Schneider, committed by Mélodie Boillet

Download images in subresolution using IIIF url

parent fa3013b4
1 merge request: !175 Download images in subresolution using IIIF url
@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None:
        help="Validation set split size.",
    )
+   parser.add_argument(
+       "--max-width",
+       type=int,
+       help="Images larger than this width will be resized to this width.",
+   )
+   parser.add_argument(
+       "--max-height",
+       type=int,
+       help="Images larger than this height will be resized to this height.",
+   )
    parser.set_defaults(func=run)
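As a sanity check, here is a minimal, self-contained sketch of how these two new options behave in isolation (only the two option definitions mirror the commit; the program name and values are illustrative):

```python
import argparse

# Minimal stand-in for the real extract subcommand parser: only the two
# options added by this commit are reproduced here.
parser = argparse.ArgumentParser(prog="teklia-dan dataset extract")
parser.add_argument(
    "--max-width",
    type=int,
    help="Images larger than this width will be resized to this width.",
)
parser.add_argument(
    "--max-height",
    type=int,
    help="Images larger than this height will be resized to this height.",
)

# Both options default to None when omitted; the extraction code then
# falls back to each element's own dimensions (see Element.__post_init__).
args = parser.parse_args(["--max-width", "2000"])
print(args.max_width, args.max_height)  # 2000 None
```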
# -*- coding: utf-8 -*-
import ast
from dataclasses import dataclass
from itertools import starmap
-from typing import List, NamedTuple, Union
+from typing import List, NamedTuple, Optional, Union
from urllib.parse import urljoin

from arkindex_export import Image
@@ -41,13 +42,21 @@ Entity = NamedTuple(
)

-class Element(NamedTuple):
+@dataclass
+class Element:
    id: str
    type: str
    polygon: str
    url: str
-   width: str
-   height: str
+   width: int
+   height: int
+   max_width: Optional[int] = None
+   max_height: Optional[int] = None
+
+   def __post_init__(self):
+       self.max_height = self.max_height or self.height
+       self.max_width = self.max_width or self.width

    @property
    def bounding_box(self):
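Switching `Element` from a `NamedTuple` to a `@dataclass` is what makes the `__post_init__` hook available, so unset maxima can fall back to the element's real dimensions. A trimmed-down sketch of that defaulting behaviour, with the field set reduced for brevity:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class ElementSketch:
    # Only the size-related fields of the real Element are kept here.
    width: int
    height: int
    max_width: Optional[int] = None
    max_height: Optional[int] = None

    def __post_init__(self):
        # An unset maximum defaults to the element's own dimension, so the
        # IIIF size request never asks for more pixels than the original has.
        self.max_height = self.max_height or self.height
        self.max_width = self.max_width or self.width

print(ElementSketch(width=800, height=600).max_width)                 # 800
print(ElementSketch(width=800, height=600, max_width=400).max_width)  # 400
```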
@@ -56,10 +65,18 @@ class Element(NamedTuple):
    @property
    def image_url(self):
        x, y, width, height = self.bounding_box
-       return urljoin(self.url + "/", f"{x},{y},{width},{height}/full/0/default.jpg")
+       return urljoin(
+           self.url + "/",
+           f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
+       )

-def get_elements(parent_id: str, element_type: str) -> List[Element]:
+def get_elements(
+   parent_id: str,
+   element_type: str,
+   max_width: Optional[int] = None,
+   max_height: Optional[int] = None,
+) -> List[Element]:
    """
    Retrieve elements from an SQLite export of an Arkindex corpus
    """
@@ -77,10 +94,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]:
            Image.height,
        )
    )
    return list(
        starmap(
-           Element,
+           lambda *x: Element(*x, max_width=max_width, max_height=max_height),
            query.tuples(),
        )
    )
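Because `query.tuples()` yields one plain tuple per row, `starmap` unpacks each row into positional arguments; the lambda then appends the same `max_width`/`max_height` keywords to every `Element` it builds. The pattern in isolation, with invented row data:

```python
from itertools import starmap

def build(a, b, scale=1):
    return (a * scale, b * scale)

rows = [(1, 2), (3, 4)]  # stand-in for query.tuples()

# starmap calls the lambda with each row unpacked positionally, while the
# closure supplies the same keyword argument to every call.
print(list(starmap(lambda *x: build(*x, scale=10), rows)))
# [(10, 20), (30, 40)]
```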
@@ -3,7 +3,7 @@
import random
from collections import defaultdict
from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
from uuid import UUID

from arkindex_export import open_database
@@ -56,6 +56,8 @@ class ArkindexExtractor:
        entity_worker_version: str = None,
        train_prob: float = None,
        val_prob: float = None,
+       max_width: Optional[int] = None,
+       max_height: Optional[int] = None,
    ) -> None:
        self.element_type = element_type
        self.parent_element_type = parent_element_type
@@ -67,6 +69,8 @@ class ArkindexExtractor:
        self.entity_worker_version = entity_worker_version
        self.train_prob = train_prob
        self.val_prob = val_prob
+       self.max_width = max_width
+       self.max_height = max_height
        self.subsets = self.get_subsets(folders)
@@ -179,7 +183,12 @@ class ArkindexExtractor:
        # Extract children elements
        else:
            for element_type in self.element_type:
-               for element in get_elements(parent.id, element_type):
+               for element in get_elements(
+                   parent.id,
+                   element_type,
+                   max_width=self.max_width,
+                   max_height=self.max_height,
+               ):
                    try:
                        data[element_type].append(self.process_element(element, split))
                    except ProcessingError as e:
@@ -192,7 +201,12 @@ class ArkindexExtractor:
        for idx, subset in enumerate(self.subsets, start=1):
            # Iterate over the pages to create splits at page level.
            for parent in tqdm(
-               get_elements(subset.id, self.parent_element_type),
+               get_elements(
+                   subset.id,
+                   self.parent_element_type,
+                   max_width=self.max_width,
+                   max_height=self.max_height,
+               ),
                desc=f"Processing {subset} {idx}/{len(self.subsets)}",
            ):
                split = subset.split or self.get_random_split()
@@ -219,6 +233,8 @@ def run(
    entity_worker_version: Union[str, bool],
    train_prob,
    val_prob,
+   max_width: Optional[int],
+   max_height: Optional[int],
):
    assert (
        use_existing_split or parent
@@ -265,4 +281,6 @@ def run(
        entity_worker_version=entity_worker_version,
        train_prob=train_prob,
        val_prob=val_prob,
+       max_width=max_width,
+       max_height=max_height,
    ).run()
@@ -21,6 +21,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkindex export database
| `--entity-worker-version` | Filter transcription entities by worker_version. Use `manual` for manual filtering | `str|uuid` | |
| `--train-prob` | Training set split size | `float` | `0.7` |
| `--val-prob` | Validation set split size | `float` | `0.15` |
+| `--max-width` | Images larger than this width will be resized to this width | `int` | |
+| `--max-height` | Images larger than this height will be resized to this height | `int` | |
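
For example, passing `--max-width 2000 --max-height 2000` (values chosen here purely for illustration) makes the extractor request every image through IIIF at a size fitting within a 2000×2000 box instead of at full resolution.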
The `--tokens` argument expects a YAML-formatted file listing the NER entities: each entry is keyed by the entity label and maps to a dict giving the entity's starting and ending tokens respectively.
```yaml
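# Illustrative sketch only: these labels and tokens are invented examples,
# not values taken from this commit.
INTITULE:
  start: ⓘ
  end: Ⓘ
DATE:
  start: ⓓ
  end: Ⓓ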
```